[graph cleaning] improved instance type defaults #172

Merged
claudio.atzori merged 2 commits from graph_cleaning into beta 2021-12-09 16:47:07 +01:00
5 changed files with 138 additions and 12 deletions

View File

@ -16,6 +16,8 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
@ -131,7 +133,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
return true; return true;
} }
public static <T extends Oaf> T cleanup(T value) { public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to clean here // nothing to clean here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -250,6 +252,38 @@ public class GraphCleaningFunctions extends CleaningFunctions {
if (Objects.nonNull(r.getInstance())) { if (Objects.nonNull(r.getInstance())) {
for (Instance i : r.getInstance()) { for (Instance i : r.getInstance()) {
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
if (r instanceof Publication) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Dataset) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof Software) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
} else if (r instanceof OtherResearchProduct) {
i
.setInstancetype(
OafMapperUtils
.qualifier(
"0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE,
ModelConstants.DNET_PUBLICATION_RESOURCE));
}
}
if (Objects.nonNull(i.getPid())) { if (Objects.nonNull(i.getPid())) {
i.setPid(processPidCleaning(i.getPid())); i.setPid(processPidCleaning(i.getPid()));
} }

View File

@ -88,7 +88,7 @@ public class CleanGraphSparkJob {
readTableFromPath(spark, inputPath, clazz) readTableFromPath(spark, inputPath, clazz)
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz)) .map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz)) .map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
.map((MapFunction<T, T>) GraphCleaningFunctions::cleanup, Encoders.bean(clazz)) .map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
.filter((FilterFunction<T>) GraphCleaningFunctions::filter) .filter((FilterFunction<T>) GraphCleaningFunctions::filter)
.write() .write()
.mode(SaveMode.Overwrite) .mode(SaveMode.Overwrite)

View File

@ -151,8 +151,11 @@ public class GraphCleaningFunctionsTest {
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid()); assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname()); assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
assertEquals("0033", p_out.getInstance().get(1).getInstancetype().getClassid()); assertEquals("0027", p_out.getInstance().get(1).getInstancetype().getClassid());
assertEquals("Audiovisual", p_out.getInstance().get(1).getInstancetype().getClassname()); assertEquals("Model", p_out.getInstance().get(1).getInstancetype().getClassname());
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassid());
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassname());
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid()); assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname()); assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
@ -167,7 +170,7 @@ public class GraphCleaningFunctionsTest {
List<Instance> poi = p_out.getInstance(); List<Instance> poi = p_out.getInstance();
assertNotNull(poi); assertNotNull(poi);
assertEquals(2, poi.size()); assertEquals(3, poi.size());
final Instance poii = poi.get(0); final Instance poii = poi.get(0);
assertNotNull(poii); assertNotNull(poii);
@ -195,7 +198,7 @@ public class GraphCleaningFunctionsTest {
assertEquals(5, p_out.getTitle().size()); assertEquals(5, p_out.getTitle().size());
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out); Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
assertEquals(3, p_cleaned.getTitle().size()); assertEquals(3, p_cleaned.getTitle().size());
@ -214,9 +217,12 @@ public class GraphCleaningFunctionsTest {
assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue()); assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue());
assertEquals("0038", p_cleaned.getInstance().get(2).getInstancetype().getClassid());
assertEquals("Other literature type", p_cleaned.getInstance().get(2).getInstancetype().getClassname());
final List<Instance> pci = p_cleaned.getInstance(); final List<Instance> pci = p_cleaned.getInstance();
assertNotNull(pci); assertNotNull(pci);
assertEquals(2, pci.size()); assertEquals(3, pci.size());
final Instance pcii = pci.get(0); final Instance pcii = pci.get(0);
assertNotNull(pcii); assertNotNull(pcii);
@ -284,7 +290,7 @@ public class GraphCleaningFunctionsTest {
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json")); .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json"));
Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
Publication cleaned = GraphCleaningFunctions.cleanup(p_out); Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned)); Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
} }
@ -295,7 +301,7 @@ public class GraphCleaningFunctionsTest {
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json")); .toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json"));
Publication p_in = MAPPER.readValue(json, Publication.class); Publication p_in = MAPPER.readValue(json, Publication.class);
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping); Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
Publication cleaned = GraphCleaningFunctions.cleanup(p_out); Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned)); Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));

View File

@ -708,7 +708,7 @@ class MappersTest {
assertEquals(1, p.getTitle().size()); assertEquals(1, p.getTitle().size());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue())); assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
final Publication p_cleaned = cleanup(fixVocabularyNames(p)); final Publication p_cleaned = cleanup(fixVocabularyNames(p), vocs);
assertNotNull(p_cleaned.getTitle()); assertNotNull(p_cleaned.getTitle());
assertFalse(p_cleaned.getTitle().isEmpty()); assertFalse(p_cleaned.getTitle().isEmpty());

View File

@ -481,14 +481,100 @@
"value": "VIRTA" "value": "VIRTA"
}, },
"instancetype": { "instancetype": {
"classid": "Audiovisual", "classid": "Model",
"classname": "Audiovisual", "classname": "Model",
"schemeid": "dnet:publication_resource", "schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource" "schemename": "dnet:publication_resource"
}, },
"url": [ "url": [
"http://dx.doi.org/10.1002/s21010127267xy" "http://dx.doi.org/10.1002/s21010127267xy"
] ]
},
{
"pid": [
{
"dataInfo": null,
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1002/s21010127267xy"
},
{
"dataInfo": null,
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1008/abcd"
}
],
"alternateIdentifier": [
{
"dataInfo": null,
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1007/s109090161569x"
},
{
"dataInfo": null,
"qualifier": {
"classid": "doi",
"classname": "doi",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "10.1009/qwerty"
}
],
"accessright": {
"classid": "CLOSED",
"classname": "CLOSED",
"schemeid": "dnet:access_modes",
"schemename": "dnet:access_modes"
},
"collectedfrom": {
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
"value": "VIRTA"
},
"dateofacceptance": {
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"value": "2016-01-01"
},
"distributionlocation": "",
"hostedby": {
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
"value": "VIRTA"
},
"instancetype": {
"classid": "xyz",
"classname": "xyz",
"schemeid": "dnet:publication_resource",
"schemename": "dnet:publication_resource"
},
"url": [
"http://dx.doi.org/10.1002/t32121238378t"
]
} }
], ],
"journal": { "journal": {