forked from D-Net/dnet-hadoop
[cleaning] removing authors without fullname or providing 'deactivated' keyword. Removing test test titles
This commit is contained in:
parent
1542196a33
commit
d1ca025b0b
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.schema.oaf;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
@ -22,6 +23,9 @@ public class CleaningFunctions {
|
||||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||||
|
|
||||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
||||||
|
/**
 * Regex tested by {@code isValidAuthorName} against the lower-cased concatenation of an author's
 * fullname, name and surname; a match marks the author as invalid.
 */
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
||||||
|
/**
 * Regex of the fragments stripped from a lower-cased title before its residual length is measured:
 * the literal word "test", any non-word character and any digit.
 * <p>
 * Written as an alternation on purpose: wrapping the same tokens in square brackets would turn them
 * into a character class that removes every single 't', 'e' and 's' from the title.
 */
public static final String TITLE_FILTER_REGEX = "(?:test|\\W|\\d)";
|
||||||
|
/**
 * Threshold used by the title filter: a title is kept only when its length, after lower-casing and
 * removing every {@code TITLE_FILTER_REGEX} match, is strictly greater than this value.
 */
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
|
||||||
|
|
||||||
static {
|
static {
|
||||||
PID_BLACKLIST.add("none");
|
PID_BLACKLIST.add("none");
|
||||||
|
@ -80,6 +84,36 @@ public class CleaningFunctions {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T extends Oaf> boolean filter(T value) {
|
||||||
|
if (value instanceof Datasource) {
|
||||||
|
// nothing to evaluate here
|
||||||
|
} else if (value instanceof Project) {
|
||||||
|
// nothing to evaluate here
|
||||||
|
} else if (value instanceof Organization) {
|
||||||
|
// nothing to evaluate here
|
||||||
|
} else if (value instanceof Relation) {
|
||||||
|
// nothing to clean here
|
||||||
|
} else if (value instanceof Result) {
|
||||||
|
|
||||||
|
Result r = (Result) value;
|
||||||
|
|
||||||
|
if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (value instanceof Publication) {
|
||||||
|
|
||||||
|
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
||||||
|
|
||||||
|
} else if (value instanceof OtherResearchProduct) {
|
||||||
|
|
||||||
|
} else if (value instanceof Software) {
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T cleanup(T value) {
|
public static <T extends Oaf> T cleanup(T value) {
|
||||||
if (value instanceof Datasource) {
|
if (value instanceof Datasource) {
|
||||||
// nothing to clean here
|
// nothing to clean here
|
||||||
|
@ -124,6 +158,12 @@ public class CleaningFunctions {
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
||||||
|
.filter(
|
||||||
|
sp -> sp
|
||||||
|
.getValue()
|
||||||
|
.toLowerCase()
|
||||||
|
.replaceAll(TITLE_FILTER_REGEX, "")
|
||||||
|
.length() > TITLE_FILTER_RESIDUAL_LENGTH)
|
||||||
.map(CleaningFunctions::cleanValue)
|
.map(CleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
@ -199,16 +239,7 @@ public class CleaningFunctions {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getAuthor())) {
|
if (Objects.nonNull(r.getAuthor())) {
|
||||||
boolean nullRank = r
|
final List<Author> authors = Lists.newArrayList();
|
||||||
.getAuthor()
|
|
||||||
.stream()
|
|
||||||
.anyMatch(a -> Objects.isNull(a.getRank()));
|
|
||||||
if (nullRank) {
|
|
||||||
int i = 1;
|
|
||||||
for (Author author : r.getAuthor()) {
|
|
||||||
author.setRank(i++);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (Author a : r.getAuthor()) {
|
for (Author a : r.getAuthor()) {
|
||||||
if (Objects.isNull(a.getPid())) {
|
if (Objects.isNull(a.getPid())) {
|
||||||
a.setPid(Lists.newArrayList());
|
a.setPid(Lists.newArrayList());
|
||||||
|
@ -235,7 +266,26 @@ public class CleaningFunctions {
|
||||||
.stream()
|
.stream()
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
if (StringUtils.isBlank(a.getFullname())) {
|
||||||
|
if (StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname())) {
|
||||||
|
a.setFullname(a.getSurname() + ", " + a.getName());
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
if (StringUtils.isNotBlank(a.getFullname()) && isValidAuthorName(a)) {
|
||||||
|
authors.add(a);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean nullRank = authors
|
||||||
|
.stream()
|
||||||
|
.anyMatch(a -> Objects.isNull(a.getRank()));
|
||||||
|
if (nullRank) {
|
||||||
|
int i = 1;
|
||||||
|
for (Author author : authors) {
|
||||||
|
author.setRank(i++);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
r.setAuthor(authors);
|
||||||
|
|
||||||
}
|
}
|
||||||
if (value instanceof Publication) {
|
if (value instanceof Publication) {
|
||||||
|
@ -252,6 +302,15 @@ public class CleaningFunctions {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean isValidAuthorName(Author a) {
|
||||||
|
return !Stream
|
||||||
|
.of(a.getFullname(), a.getName(), a.getSurname())
|
||||||
|
.filter(s -> s != null && !s.isEmpty())
|
||||||
|
.collect(Collectors.joining(""))
|
||||||
|
.toLowerCase()
|
||||||
|
.matches(INVALID_AUTHOR_REGEX);
|
||||||
|
}
|
||||||
|
|
||||||
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
|
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
|
||||||
return pids
|
return pids
|
||||||
.stream()
|
.stream()
|
||||||
|
|
|
@ -90,6 +90,7 @@ public class CleanGraphSparkJob {
|
||||||
.map((MapFunction<T, T>) value -> fixVocabularyNames(value), Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> fixVocabularyNames(value), Encoders.bean(clazz))
|
||||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||||
.map((MapFunction<T, T>) value -> cleanup(value), Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> cleanup(value), Encoders.bean(clazz))
|
||||||
|
.filter((FilterFunction<T>) value -> filter(value))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
.option("compression", "gzip")
|
.option("compression", "gzip")
|
||||||
|
|
|
@ -67,6 +67,7 @@ public class CleaningFunctionTest {
|
||||||
|
|
||||||
assertNotNull(p_out.getPublisher());
|
assertNotNull(p_out.getPublisher());
|
||||||
assertNull(p_out.getPublisher().getValue());
|
assertNull(p_out.getPublisher().getValue());
|
||||||
|
|
||||||
assertEquals("und", p_out.getLanguage().getClassid());
|
assertEquals("und", p_out.getLanguage().getClassid());
|
||||||
assertEquals("Undetermined", p_out.getLanguage().getClassname());
|
assertEquals("Undetermined", p_out.getLanguage().getClassname());
|
||||||
|
|
||||||
|
@ -120,6 +121,9 @@ public class CleaningFunctionTest {
|
||||||
.isPresent());
|
.isPresent());
|
||||||
|
|
||||||
Publication p_cleaned = CleaningFunctions.cleanup(p_out);
|
Publication p_cleaned = CleaningFunctions.cleanup(p_out);
|
||||||
|
|
||||||
|
assertEquals(1, p_cleaned.getTitle().size());
|
||||||
|
|
||||||
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
||||||
assertNull(p_out.getPublisher());
|
assertNull(p_out.getPublisher());
|
||||||
|
|
||||||
|
|
|
@ -865,6 +865,28 @@
|
||||||
"schemename": "dnet:dataCite_title"
|
"schemename": "dnet:dataCite_title"
|
||||||
},
|
},
|
||||||
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
|
"value": "Optical response of strained- and unstrained-silicon cold-electron bolometers"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "main title",
|
||||||
|
"classname": "main title",
|
||||||
|
"schemeid": "dnet:dataCite_title",
|
||||||
|
"schemename": "dnet:dataCite_title"
|
||||||
|
},
|
||||||
|
"value": "test test 123 test"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
Loading…
Reference in New Issue