forked from antonis.lempesis/dnet-hadoop
graph cleaning workflow: separate orcid_pending from orcid, depending on the author pid provenance
This commit is contained in:
parent
2d15667b4a
commit
57f448b7a4
|
@ -7,6 +7,9 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
|
||||||
public class ModelConstants {
|
public class ModelConstants {
|
||||||
|
|
||||||
|
public static final String ORCID = "orcid";
|
||||||
|
public static final String ORCID_PENDING = "orcid_pending";
|
||||||
|
|
||||||
public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies";
|
public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies";
|
||||||
public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies";
|
public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies";
|
||||||
public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource";
|
public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource";
|
||||||
|
|
|
@ -189,6 +189,14 @@ public class CleaningFunctions {
|
||||||
author.setRank(i++);
|
author.setRank(i++);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
final Set<String> collectedFrom = Optional
|
||||||
|
.ofNullable(r.getCollectedfrom())
|
||||||
|
.map(c -> c.stream()
|
||||||
|
.map(KeyValue::getKey)
|
||||||
|
.collect(Collectors.toCollection(HashSet::new)))
|
||||||
|
.orElse(new HashSet<>());
|
||||||
|
|
||||||
for (Author a : r.getAuthor()) {
|
for (Author a : r.getAuthor()) {
|
||||||
if (Objects.isNull(a.getPid())) {
|
if (Objects.isNull(a.getPid())) {
|
||||||
a.setPid(Lists.newArrayList());
|
a.setPid(Lists.newArrayList());
|
||||||
|
@ -201,13 +209,28 @@ public class CleaningFunctions {
|
||||||
.filter(p -> Objects.nonNull(p.getQualifier()))
|
.filter(p -> Objects.nonNull(p.getQualifier()))
|
||||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||||
.map(p -> {
|
.map(p -> {
|
||||||
|
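// hack to distinguish orcid from orcid_pending: a pid whose provenance action classid equals SYSIMPORT_CROSSWALK_ENTITYREGISTRY is marked as orcid, any other (or missing) provenance as orcid_pending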
|
||||||
|
String pidProvenance = Optional
|
||||||
|
.ofNullable(p.getDataInfo())
|
||||||
|
.map(d -> Optional
|
||||||
|
.ofNullable(d.getProvenanceaction())
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.orElse(""))
|
||||||
|
.orElse("");
|
||||||
|
if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
|
||||||
|
p.getQualifier().setClassid(ModelConstants.ORCID);
|
||||||
|
} else {
|
||||||
|
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
|
||||||
|
}
|
||||||
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
||||||
return p;
|
return p;
|
||||||
})
|
})
|
||||||
.collect(
|
.collect(
|
||||||
Collectors
|
Collectors
|
||||||
.toMap(
|
.toMap(
|
||||||
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
|
p -> p.getQualifier().getClassid() + p.getValue(),
|
||||||
|
Function.identity(),
|
||||||
|
(p1, p2) -> p1,
|
||||||
LinkedHashMap::new))
|
LinkedHashMap::new))
|
||||||
.values()
|
.values()
|
||||||
.stream()
|
.stream()
|
||||||
|
|
|
@ -49,6 +49,28 @@
|
||||||
"schemename": "dnet:pid_types"
|
"schemename": "dnet:pid_types"
|
||||||
},
|
},
|
||||||
"value": "https://orcid.org/0000-0001-9613-6639"
|
"value": "https://orcid.org/0000-0001-9613-6639"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "sysimport:crosswalk:entityregistry",
|
||||||
|
"classname": "sysimport:crosswalk:entityregistry",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "orcid",
|
||||||
|
"classname": "ORCID12",
|
||||||
|
"schemeid": "dnet:pid_types",
|
||||||
|
"schemename": "dnet:pid_types"
|
||||||
|
},
|
||||||
|
"value": "0000-0001-9613-6639"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"rank": 1,
|
"rank": 1,
|
||||||
|
|
|
@ -1031,6 +1031,7 @@ dnet:pid_types @=@ dnet:pid_types @=@ jprn @=@ JPRN Identifier
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ mag_id @=@ Microsoft Academic Graph Identifier
|
dnet:pid_types @=@ dnet:pid_types @=@ mag_id @=@ Microsoft Academic Graph Identifier
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ oai @=@ Open Archives Initiative
|
dnet:pid_types @=@ dnet:pid_types @=@ oai @=@ Open Archives Initiative
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ orcid @=@ Open Researcher and Contributor ID
|
dnet:pid_types @=@ dnet:pid_types @=@ orcid @=@ Open Researcher and Contributor ID
|
||||||
|
dnet:pid_types @=@ dnet:pid_types @=@ orcid_pending @=@ Open Researcher and Contributor ID
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ PANGAEA @=@ PANGAEA
|
dnet:pid_types @=@ dnet:pid_types @=@ PANGAEA @=@ PANGAEA
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ epo_nr_epodoc @=@ Patent application number in EPODOC format
|
dnet:pid_types @=@ dnet:pid_types @=@ epo_nr_epodoc @=@ Patent application number in EPODOC format
|
||||||
dnet:pid_types @=@ dnet:pid_types @=@ UNKNOWN @=@ UNKNOWN
|
dnet:pid_types @=@ dnet:pid_types @=@ UNKNOWN @=@ UNKNOWN
|
||||||
|
|
Loading…
Reference in New Issue