graph cleaning workflow separate orcid_pending from orcid, depending on the author pid provenance

This commit is contained in:
Claudio Atzori 2020-12-02 10:44:05 +01:00
parent 2d15667b4a
commit 57f448b7a4
4 changed files with 50 additions and 1 deletions

View File

@ -7,6 +7,9 @@ import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class ModelConstants { public class ModelConstants {
public static final String ORCID = "orcid";
public static final String ORCID_PENDING = "orcid_pending";
public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies"; public static final String DNET_SUBJECT_TYPOLOGIES = "dnet:subject_classification_typologies";
public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies"; public static final String DNET_RESULT_TYPOLOGIES = "dnet:result_typologies";
public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource"; public static final String DNET_PUBLICATION_RESOURCE = "dnet:publication_resource";

View File

@ -189,6 +189,14 @@ public class CleaningFunctions {
author.setRank(i++); author.setRank(i++);
} }
} }
final Set<String> collectedFrom = Optional
.ofNullable(r.getCollectedfrom())
.map(c -> c.stream()
.map(KeyValue::getKey)
.collect(Collectors.toCollection(HashSet::new)))
.orElse(new HashSet<>());
for (Author a : r.getAuthor()) { for (Author a : r.getAuthor()) {
if (Objects.isNull(a.getPid())) { if (Objects.isNull(a.getPid())) {
a.setPid(Lists.newArrayList()); a.setPid(Lists.newArrayList());
@ -201,13 +209,28 @@ public class CleaningFunctions {
.filter(p -> Objects.nonNull(p.getQualifier())) .filter(p -> Objects.nonNull(p.getQualifier()))
.filter(p -> StringUtils.isNotBlank(p.getValue())) .filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> { .map(p -> {
// hack to distinguish orcid from orcid_pending
String pidProvenance = Optional
.ofNullable(p.getDataInfo())
.map(d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
if (pidProvenance.equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) {
p.getQualifier().setClassid(ModelConstants.ORCID);
} else {
p.getQualifier().setClassid(ModelConstants.ORCID_PENDING);
}
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, "")); p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
return p; return p;
}) })
.collect( .collect(
Collectors Collectors
.toMap( .toMap(
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, p -> p.getQualifier().getClassid() + p.getValue(),
Function.identity(),
(p1, p2) -> p1,
LinkedHashMap::new)) LinkedHashMap::new))
.values() .values()
.stream() .stream()

View File

@ -49,6 +49,28 @@
"schemename": "dnet:pid_types" "schemename": "dnet:pid_types"
}, },
"value": "https://orcid.org/0000-0001-9613-6639" "value": "https://orcid.org/0000-0001-9613-6639"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:entityregistry",
"classname": "sysimport:crosswalk:entityregistry",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "orcid",
"classname": "ORCID12",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "0000-0001-9613-6639"
} }
], ],
"rank": 1, "rank": 1,

View File

@ -1031,6 +1031,7 @@ dnet:pid_types @=@ dnet:pid_types @=@ jprn @=@ JPRN Identifier
dnet:pid_types @=@ dnet:pid_types @=@ mag_id @=@ Microsoft Academic Graph Identifier dnet:pid_types @=@ dnet:pid_types @=@ mag_id @=@ Microsoft Academic Graph Identifier
dnet:pid_types @=@ dnet:pid_types @=@ oai @=@ Open Archives Initiative dnet:pid_types @=@ dnet:pid_types @=@ oai @=@ Open Archives Initiative
dnet:pid_types @=@ dnet:pid_types @=@ orcid @=@ Open Researcher and Contributor ID dnet:pid_types @=@ dnet:pid_types @=@ orcid @=@ Open Researcher and Contributor ID
dnet:pid_types @=@ dnet:pid_types @=@ orcid_pending @=@ Open Researcher and Contributor ID
dnet:pid_types @=@ dnet:pid_types @=@ PANGAEA @=@ PANGAEA dnet:pid_types @=@ dnet:pid_types @=@ PANGAEA @=@ PANGAEA
dnet:pid_types @=@ dnet:pid_types @=@ epo_nr_epodoc @=@ Patent application number in EPODOC format dnet:pid_types @=@ dnet:pid_types @=@ epo_nr_epodoc @=@ Patent application number in EPODOC format
dnet:pid_types @=@ dnet:pid_types @=@ UNKNOWN @=@ UNKNOWN dnet:pid_types @=@ dnet:pid_types @=@ UNKNOWN @=@ UNKNOWN