author pids made unique by value

This commit is contained in:
Claudio Atzori 2020-10-01 12:50:40 +02:00
parent e265c3e125
commit 2e9e13444d
2 changed files with 47 additions and 2 deletions

View File

@ -1,9 +1,12 @@
package eu.dnetlib.dhp.oa.graph.clean; package eu.dnetlib.dhp.oa.graph.clean;
import java.util.LinkedHashMap;
import java.util.Objects; import java.util.Objects;
import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.clearspring.analytics.util.Lists;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper; import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
@ -13,6 +16,8 @@ import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions { public class CleaningFunctions {
/**
 * Regular expression matching a leading {@code http://orcid.org/} or {@code https://orcid.org/}
 * prefix; it is used to strip that prefix from author pid values.
 */
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static <T extends Oaf> T fixVocabularyNames(T value) { public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to clean here // nothing to clean here
@ -139,6 +144,25 @@ public class CleaningFunctions {
author.setRank(i++); author.setRank(i++);
} }
} }
// Normalise the pid list of every author: a missing list becomes an empty one, entries without
// a qualifier or with a blank value are dropped, the ORCID URL prefix is stripped from the
// value, and only the first pid is kept for each distinct value.
for (Author a : r.getAuthor()) {
	if (Objects.isNull(a.getPid())) {
		a.setPid(Lists.newArrayList());
	} else {
		a.setPid(
			a.getPid().stream()
				// guard against null entries, which would otherwise fail on getQualifier()
				.filter(Objects::nonNull)
				.filter(p -> Objects.nonNull(p.getQualifier()))
				.filter(p -> StringUtils.isNotBlank(p.getValue()))
				.map(p -> {
					p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
					return p;
				})
				// a value made only of the ORCID prefix is empty once normalised: discard it
				.filter(p -> StringUtils.isNotBlank(p.getValue()))
				// LinkedHashMap preserves the original pid order; on duplicates the first one wins
				.collect(Collectors.toMap(StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1, LinkedHashMap::new))
				.values()
				.stream()
				.collect(Collectors.toList()));
	}
}
} }
if (value instanceof Publication) { if (value instanceof Publication) {

View File

@ -27,6 +27,28 @@
"schemename": "dnet:pid_types" "schemename": "dnet:pid_types"
}, },
"value": "0000-0001-9613-6639" "value": "0000-0001-9613-6639"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "sysimport:crosswalk:datasetarchive",
"classname": "sysimport:crosswalk:datasetarchive",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "ORCID12",
"classname": "ORCID12",
"schemeid": "dnet:pid_types",
"schemename": "dnet:pid_types"
},
"value": "https://orcid.org/0000-0001-9613-6639"
} }
], ],
"rank": 1, "rank": 1,
@ -91,8 +113,7 @@
], ],
"fullname": "Barry, Peter S.", "fullname": "Barry, Peter S.",
"name": "Peter S.", "name": "Peter S.",
"pid": [ "pid": null,
],
"rank": 3, "rank": 3,
"surname": "Barry" "surname": "Barry"
}, },