bulk cleaning when found null or empty, sets bestaccessrights evaluating the result instances

This commit is contained in:
Claudio Atzori 2020-07-08 17:53:35 +02:00
parent 853e8d7987
commit 67e1d222b6
4 changed files with 21 additions and 12 deletions

View File

@ -8,6 +8,7 @@ import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -97,7 +98,7 @@ public class CleanGraphSparkJob {
.json(outputPath); .json(outputPath);
} }
private static <T extends Oaf> T fixDefaults(T value) { protected static <T extends Oaf> T fixDefaults(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to clean here // nothing to clean here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -134,11 +135,6 @@ public class CleanGraphSparkJob {
.setResourcetype( .setResourcetype(
qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); qualifier("UNKNOWN", "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
} }
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
r
.setBestaccessright(
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
}
if (Objects.nonNull(r.getInstance())) { if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) { for (Instance i : r.getInstance()) {
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
@ -152,6 +148,15 @@ public class CleanGraphSparkJob {
} }
} }
} }
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) {
r.setBestaccessright(
qualifier("UNKNOWN", "not available", ModelConstants.DNET_ACCESS_MODES));
} else {
r.setBestaccessright(bestaccessrights);
}
}
if (Objects.nonNull(r.getAuthor())) { if (Objects.nonNull(r.getAuthor())) {
boolean nullRank = r boolean nullRank = r
.getAuthor() .getAuthor()

View File

@ -378,6 +378,10 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info); protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) { protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) { if (instanceList != null) {
final Optional<Qualifier> min = instanceList final Optional<Qualifier> min = instanceList

View File

@ -57,6 +57,8 @@ public class CleaningFunctionTest {
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json")); String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_in = MAPPER.readValue(json, Publication.class);
assertNull(p_in.getBestaccessright());
assertTrue(p_in instanceof Result); assertTrue(p_in instanceof Result);
assertTrue(p_in instanceof Publication); assertTrue(p_in instanceof Publication);
@ -84,6 +86,9 @@ public class CleaningFunctionTest {
.map(p -> p.getQualifier()) .map(p -> p.getQualifier())
.allMatch(q -> pidTerms.contains(q.getClassid()))); .allMatch(q -> pidTerms.contains(q.getClassid())));
Publication p_defaults = CleanGraphSparkJob.fixDefaults(p_out);
assertEquals("CLOSED", p_defaults.getBestaccessright().getClassid());
// TODO add more assertions to verity the cleaned values // TODO add more assertions to verity the cleaned values
System.out.println(MAPPER.writeValueAsString(p_out)); System.out.println(MAPPER.writeValueAsString(p_out));

View File

@ -185,12 +185,7 @@
"surname": "" "surname": ""
} }
], ],
"bestaccessright": { "bestaccessright": null,
"classid": "CLOSED",
"classname": "Closed Access",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"collectedfrom": [ "collectedfrom": [
{ {
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747", "key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",