forked from D-Net/dnet-hadoop
[cleaning wf] fixed methodology to rule out invalid result titles, based on https://support.openaire.eu/issues/7206
This commit is contained in:
parent
49f897ef29
commit
82a4e4efae
|
@ -27,7 +27,10 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
public static final int ORCID_LEN = 19;
|
public static final int ORCID_LEN = 19;
|
||||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||||
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
||||||
public static final String TITLE_FILTER_REGEX = "(test)|\\W|\\d";
|
|
||||||
|
public static final String TITLE_TEST = "test";
|
||||||
|
public static final String TITLE_FILTER_REGEX = String.format("(%s)|\\W|\\d", TITLE_TEST);
|
||||||
|
|
||||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 5;
|
||||||
|
|
||||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||||
|
@ -195,10 +198,16 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
final String title = sp
|
final String title = sp
|
||||||
.getValue()
|
.getValue()
|
||||||
.toLowerCase();
|
.toLowerCase();
|
||||||
final String residual = Unidecode
|
final String decoded = Unidecode.decode(title);
|
||||||
.decode(title)
|
|
||||||
.replaceAll(TITLE_FILTER_REGEX, "");
|
if (StringUtils.contains(decoded, TITLE_TEST)) {
|
||||||
return residual.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
return decoded
|
||||||
|
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||||
|
.length() > TITLE_FILTER_RESIDUAL_LENGTH;
|
||||||
|
}
|
||||||
|
return !decoded
|
||||||
|
.replaceAll("\\W|\\d", "")
|
||||||
|
.isEmpty();
|
||||||
})
|
})
|
||||||
.map(GraphCleaningFunctions::cleanValue)
|
.map(GraphCleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
|
|
|
@ -8,6 +8,7 @@ import java.io.IOException;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
@ -137,9 +138,21 @@ public class GraphCleaningFunctionsTest {
|
||||||
.stream()
|
.stream()
|
||||||
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
||||||
|
|
||||||
|
assertEquals(5, p_out.getTitle().size());
|
||||||
|
|
||||||
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
|
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
|
||||||
|
|
||||||
assertEquals(1, p_cleaned.getTitle().size());
|
assertEquals(3, p_cleaned.getTitle().size());
|
||||||
|
|
||||||
|
List<String> titles = p_cleaned
|
||||||
|
.getTitle()
|
||||||
|
.stream()
|
||||||
|
.map(StructuredProperty::getValue)
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
assertTrue(titles.contains("omic"));
|
||||||
|
assertTrue(
|
||||||
|
titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
|
||||||
|
assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"));
|
||||||
|
|
||||||
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
||||||
assertNull(p_out.getPublisher());
|
assertNull(p_out.getPublisher());
|
||||||
|
|
|
@ -864,7 +864,7 @@
|
||||||
"schemeid": "dnet:dataCite_title",
|
"schemeid": "dnet:dataCite_title",
|
||||||
"schemename": "dnet:dataCite_title"
|
"schemename": "dnet:dataCite_title"
|
||||||
},
|
},
|
||||||
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
|
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers test"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"dataInfo": {
|
"dataInfo": {
|
||||||
|
@ -887,6 +887,72 @@
|
||||||
"schemename": "dnet:dataCite_title"
|
"schemename": "dnet:dataCite_title"
|
||||||
},
|
},
|
||||||
"value": "test test 123 test"
|
"value": "test test 123 test"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "main title",
|
||||||
|
"classname": "main title",
|
||||||
|
"schemeid": "dnet:dataCite_title",
|
||||||
|
"schemename": "dnet:dataCite_title"
|
||||||
|
},
|
||||||
|
"value": "omic"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "main title",
|
||||||
|
"classname": "main title",
|
||||||
|
"schemeid": "dnet:dataCite_title",
|
||||||
|
"schemename": "dnet:dataCite_title"
|
||||||
|
},
|
||||||
|
"value": "「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "main title",
|
||||||
|
"classname": "main title",
|
||||||
|
"schemeid": "dnet:dataCite_title",
|
||||||
|
"schemename": "dnet:dataCite_title"
|
||||||
|
},
|
||||||
|
"value": "-"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
Loading…
Reference in New Issue