forked from antonis.lempesis/dnet-hadoop
Merge branch 'beta' of https://code-repo.d4science.org/D-Net/dnet-hadoop into beta
This commit is contained in:
commit
936578aaf1
|
@ -57,9 +57,17 @@ public class VocabularyGroup implements Serializable {
|
||||||
final String syn = arr[2].trim();
|
final String syn = arr[2].trim();
|
||||||
|
|
||||||
vocs.addSynonyms(vocId, termId, syn);
|
vocs.addSynonyms(vocId, termId, syn);
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// add the term names as synonyms
|
||||||
|
vocs.vocs.values().forEach(voc -> {
|
||||||
|
voc.getTerms().values().forEach(term -> {
|
||||||
|
voc.addSynonym(term.getName().toLowerCase(), term.getId());
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
return vocs;
|
return vocs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -16,6 +16,8 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
@ -131,7 +133,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T extends Oaf> T cleanup(T value) {
|
public static <T extends Oaf> T cleanup(T value, VocabularyGroup vocs) {
|
||||||
if (value instanceof Datasource) {
|
if (value instanceof Datasource) {
|
||||||
// nothing to clean here
|
// nothing to clean here
|
||||||
} else if (value instanceof Project) {
|
} else if (value instanceof Project) {
|
||||||
|
@ -250,6 +252,38 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
if (Objects.nonNull(r.getInstance())) {
|
if (Objects.nonNull(r.getInstance())) {
|
||||||
|
|
||||||
for (Instance i : r.getInstance()) {
|
for (Instance i : r.getInstance()) {
|
||||||
|
if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) {
|
||||||
|
if (r instanceof Publication) {
|
||||||
|
i
|
||||||
|
.setInstancetype(
|
||||||
|
OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
"0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||||
|
} else if (r instanceof Dataset) {
|
||||||
|
i
|
||||||
|
.setInstancetype(
|
||||||
|
OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
"0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||||
|
} else if (r instanceof Software) {
|
||||||
|
i
|
||||||
|
.setInstancetype(
|
||||||
|
OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
"0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||||
|
} else if (r instanceof OtherResearchProduct) {
|
||||||
|
i
|
||||||
|
.setInstancetype(
|
||||||
|
OafMapperUtils
|
||||||
|
.qualifier(
|
||||||
|
"0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE,
|
||||||
|
ModelConstants.DNET_PUBLICATION_RESOURCE));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (Objects.nonNull(i.getPid())) {
|
if (Objects.nonNull(i.getPid())) {
|
||||||
i.setPid(processPidCleaning(i.getPid()));
|
i.setPid(processPidCleaning(i.getPid()));
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,7 +88,7 @@ public class CleanGraphSparkJob {
|
||||||
readTableFromPath(spark, inputPath, clazz)
|
readTableFromPath(spark, inputPath, clazz)
|
||||||
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
|
.map((MapFunction<T, T>) GraphCleaningFunctions::fixVocabularyNames, Encoders.bean(clazz))
|
||||||
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> OafCleaner.apply(value, mapping), Encoders.bean(clazz))
|
||||||
.map((MapFunction<T, T>) GraphCleaningFunctions::cleanup, Encoders.bean(clazz))
|
.map((MapFunction<T, T>) value -> GraphCleaningFunctions.cleanup(value, vocs), Encoders.bean(clazz))
|
||||||
.filter((FilterFunction<T>) GraphCleaningFunctions::filter)
|
.filter((FilterFunction<T>) GraphCleaningFunctions::filter)
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Overwrite)
|
.mode(SaveMode.Overwrite)
|
||||||
|
|
|
@ -151,6 +151,12 @@ public class GraphCleaningFunctionsTest {
|
||||||
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
|
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
|
||||||
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
|
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
assertEquals("0027", p_out.getInstance().get(1).getInstancetype().getClassid());
|
||||||
|
assertEquals("Model", p_out.getInstance().get(1).getInstancetype().getClassname());
|
||||||
|
|
||||||
|
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassid());
|
||||||
|
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassname());
|
||||||
|
|
||||||
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
|
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
|
||||||
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
|
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
|
||||||
|
|
||||||
|
@ -164,7 +170,7 @@ public class GraphCleaningFunctionsTest {
|
||||||
|
|
||||||
List<Instance> poi = p_out.getInstance();
|
List<Instance> poi = p_out.getInstance();
|
||||||
assertNotNull(poi);
|
assertNotNull(poi);
|
||||||
assertEquals(1, poi.size());
|
assertEquals(3, poi.size());
|
||||||
|
|
||||||
final Instance poii = poi.get(0);
|
final Instance poii = poi.get(0);
|
||||||
assertNotNull(poii);
|
assertNotNull(poii);
|
||||||
|
@ -192,7 +198,7 @@ public class GraphCleaningFunctionsTest {
|
||||||
|
|
||||||
assertEquals(5, p_out.getTitle().size());
|
assertEquals(5, p_out.getTitle().size());
|
||||||
|
|
||||||
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
|
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
|
||||||
|
|
||||||
assertEquals(3, p_cleaned.getTitle().size());
|
assertEquals(3, p_cleaned.getTitle().size());
|
||||||
|
|
||||||
|
@ -211,9 +217,12 @@ public class GraphCleaningFunctionsTest {
|
||||||
|
|
||||||
assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue());
|
assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue());
|
||||||
|
|
||||||
|
assertEquals("0038", p_cleaned.getInstance().get(2).getInstancetype().getClassid());
|
||||||
|
assertEquals("Other literature type", p_cleaned.getInstance().get(2).getInstancetype().getClassname());
|
||||||
|
|
||||||
final List<Instance> pci = p_cleaned.getInstance();
|
final List<Instance> pci = p_cleaned.getInstance();
|
||||||
assertNotNull(pci);
|
assertNotNull(pci);
|
||||||
assertEquals(1, pci.size());
|
assertEquals(3, pci.size());
|
||||||
|
|
||||||
final Instance pcii = pci.get(0);
|
final Instance pcii = pci.get(0);
|
||||||
assertNotNull(pcii);
|
assertNotNull(pcii);
|
||||||
|
@ -281,7 +290,7 @@ public class GraphCleaningFunctionsTest {
|
||||||
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json"));
|
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json"));
|
||||||
Publication p_in = MAPPER.readValue(json, Publication.class);
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
|
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
|
||||||
Publication cleaned = GraphCleaningFunctions.cleanup(p_out);
|
Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
|
||||||
|
|
||||||
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
|
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
|
||||||
}
|
}
|
||||||
|
@ -292,7 +301,7 @@ public class GraphCleaningFunctionsTest {
|
||||||
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json"));
|
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json"));
|
||||||
Publication p_in = MAPPER.readValue(json, Publication.class);
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
||||||
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
|
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
|
||||||
Publication cleaned = GraphCleaningFunctions.cleanup(p_out);
|
Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
|
||||||
|
|
||||||
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
|
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
|
||||||
|
|
||||||
|
|
|
@ -708,7 +708,7 @@ class MappersTest {
|
||||||
assertEquals(1, p.getTitle().size());
|
assertEquals(1, p.getTitle().size());
|
||||||
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
|
||||||
|
|
||||||
final Publication p_cleaned = cleanup(fixVocabularyNames(p));
|
final Publication p_cleaned = cleanup(fixVocabularyNames(p), vocs);
|
||||||
|
|
||||||
assertNotNull(p_cleaned.getTitle());
|
assertNotNull(p_cleaned.getTitle());
|
||||||
assertFalse(p_cleaned.getTitle().isEmpty());
|
assertFalse(p_cleaned.getTitle().isEmpty());
|
||||||
|
|
|
@ -403,6 +403,178 @@
|
||||||
"http://juuli.fi/Record/0275158616",
|
"http://juuli.fi/Record/0275158616",
|
||||||
"http://dx.doi.org/10.1007/s109090161569x"
|
"http://dx.doi.org/10.1007/s109090161569x"
|
||||||
]
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pid": [
|
||||||
|
{
|
||||||
|
"dataInfo": null,
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1002/s21010127267xy"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": null,
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1008/abcd"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"alternateIdentifier": [
|
||||||
|
{
|
||||||
|
"dataInfo": null,
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1007/s109090161569x"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": null,
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1009/qwerty"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"accessright": {
|
||||||
|
"classid": "CLOSED",
|
||||||
|
"classname": "CLOSED",
|
||||||
|
"schemeid": "dnet:access_modes",
|
||||||
|
"schemename": "dnet:access_modes"
|
||||||
|
},
|
||||||
|
"collectedfrom": {
|
||||||
|
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||||
|
"value": "VIRTA"
|
||||||
|
},
|
||||||
|
"dateofacceptance": {
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"value": "2016-01-01"
|
||||||
|
},
|
||||||
|
"distributionlocation": "",
|
||||||
|
"hostedby": {
|
||||||
|
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||||
|
"value": "VIRTA"
|
||||||
|
},
|
||||||
|
"instancetype": {
|
||||||
|
"classid": "Model",
|
||||||
|
"classname": "Model",
|
||||||
|
"schemeid": "dnet:publication_resource",
|
||||||
|
"schemename": "dnet:publication_resource"
|
||||||
|
},
|
||||||
|
"url": [
|
||||||
|
"http://dx.doi.org/10.1002/s21010127267xy"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"pid": [
|
||||||
|
{
|
||||||
|
"dataInfo": null,
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1002/s21010127267xy"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": null,
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1008/abcd"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"alternateIdentifier": [
|
||||||
|
{
|
||||||
|
"dataInfo": null,
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1007/s109090161569x"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": null,
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "doi",
|
||||||
|
"classname": "doi",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "10.1009/qwerty"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"accessright": {
|
||||||
|
"classid": "CLOSED",
|
||||||
|
"classname": "CLOSED",
|
||||||
|
"schemeid": "dnet:access_modes",
|
||||||
|
"schemename": "dnet:access_modes"
|
||||||
|
},
|
||||||
|
"collectedfrom": {
|
||||||
|
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||||
|
"value": "VIRTA"
|
||||||
|
},
|
||||||
|
"dateofacceptance": {
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"classname": "sysimport:crosswalk:datasetarchive",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"value": "2016-01-01"
|
||||||
|
},
|
||||||
|
"distributionlocation": "",
|
||||||
|
"hostedby": {
|
||||||
|
"key": "10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747",
|
||||||
|
"value": "VIRTA"
|
||||||
|
},
|
||||||
|
"instancetype": {
|
||||||
|
"classid": "xyz",
|
||||||
|
"classname": "xyz",
|
||||||
|
"schemeid": "dnet:publication_resource",
|
||||||
|
"schemename": "dnet:publication_resource"
|
||||||
|
},
|
||||||
|
"url": [
|
||||||
|
"http://dx.doi.org/10.1002/t32121238378t"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"journal": {
|
"journal": {
|
||||||
|
|
|
@ -1212,6 +1212,17 @@ public class XmlRecordFactory implements Serializable {
|
||||||
"processingchargecurrency", instance.getProcessingchargecurrency()));
|
"processingchargecurrency", instance.getProcessingchargecurrency()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (instance.getLicense() != null) {
|
||||||
|
fields
|
||||||
|
.addAll(
|
||||||
|
instance
|
||||||
|
.getLicense()
|
||||||
|
.stream()
|
||||||
|
.filter(d -> isNotBlank(d))
|
||||||
|
.map(d -> XmlSerializationUtils.asXmlElement("license", d))
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
|
|
||||||
children
|
children
|
||||||
.add(
|
.add(
|
||||||
templateFactory
|
templateFactory
|
||||||
|
@ -1282,6 +1293,7 @@ public class XmlRecordFactory implements Serializable {
|
||||||
.collect(Collectors.groupingBy(ImmutablePair::getLeft))
|
.collect(Collectors.groupingBy(ImmutablePair::getLeft))
|
||||||
.values()
|
.values()
|
||||||
.stream()
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
.map(this::mergeInstances);
|
.map(this::mergeInstances);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -541,7 +541,7 @@
|
||||||
},
|
},
|
||||||
"trust": ""
|
"trust": ""
|
||||||
},
|
},
|
||||||
"value": ""
|
"value": "CC-BY"
|
||||||
},
|
},
|
||||||
"url": [
|
"url": [
|
||||||
"http://dx.doi.org/10.1109/TED.2018.2853550"
|
"http://dx.doi.org/10.1109/TED.2018.2853550"
|
||||||
|
|
Loading…
Reference in New Issue