forked from D-Net/dnet-hadoop
[cleaning wf] fixed methodology to rule out invalid result titles, based on https://support.openaire.eu/issues/7206
This commit is contained in:
parent
49f897ef29
commit
82a4e4efae
|
@ -27,7 +27,10 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
public static final int ORCID_LEN = 19;
|
||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
||||
public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d";
|
||||
|
||||
public static final String TITLE_TEST = "test";
|
||||
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
|
||||
|
||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
||||
|
||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||
|
@ -195,10 +198,16 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
final String title = sp
|
||||
.getValue()
|
||||
.toLowerCase();
|
||||
final String residual = Unidecode
|
||||
.decode(title)
|
||||
.replaceAll(TITLE_FILTER_REGEX, "");
|
||||
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||
final String decoded = Unidecode.decode(title);
|
||||
|
||||
if (StringUtils.contains(decoded, TITLE_TEST)) {
|
||||
return decoded
|
||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||
}
|
||||
return !decoded
|
||||
.replaceAll("\\W|\\d", "")
|
||||
.isEmpty();
|
||||
})
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
|
|
|
@ -8,6 +8,7 @@ import java.io.IOException;
|
|||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
@ -137,9 +138,21 @@ public class GraphCleaningFunctionsTest {
|
|||
.stream()
|
||||
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||
|
||||
assertEquals(5, p_out.getTitle().size());
|
||||
|
||||
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
|
||||
|
||||
assertEquals(1, p_cleaned.getTitle().size());
|
||||
assertEquals(3, p_cleaned.getTitle().size());
|
||||
|
||||
List<String> titles = p_cleaned
|
||||
.getTitle()
|
||||
.stream()
|
||||
.map(StructuredProperty::getValue)
|
||||
.collect(Collectors.toList());
|
||||
assertTrue(titles.contains("omic"));
|
||||
assertTrue(
|
||||
titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
|
||||
assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"));
|
||||
|
||||
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
||||
assertNull(p_out.getPublisher());
|
||||
|
|
|
@ -864,7 +864,7 @@
|
|||
"schemeid": "dnet:dataCite_title",
|
||||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
|
||||
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers test"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
|
@ -887,6 +887,72 @@
|
|||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "test test 123 test"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "main title",
|
||||
"classname": "main title",
|
||||
"schemeid": "dnet:dataCite_title",
|
||||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "omic"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "main title",
|
||||
"classname": "main title",
|
||||
"schemeid": "dnet:dataCite_title",
|
||||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "sysimport:crosswalk:datasetarchive",
|
||||
"classname": "sysimport:crosswalk:datasetarchive",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "main title",
|
||||
"classname": "main title",
|
||||
"schemeid": "dnet:dataCite_title",
|
||||
"schemename": "dnet:dataCite_title"
|
||||
},
|
||||
"value": "-"
|
||||
}
|
||||
]
|
||||
}
|
Loading…
Reference in New Issue