cleaning of subjects: avoid duplicated subjects, prioritise collected vs inferred or other sources

This commit is contained in:
Claudio Atzori 2022-08-12 15:09:16 +02:00
parent adb526b0e1
commit b7c387c21f
5 changed files with 101 additions and 15 deletions

View File

@ -1,6 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import java.time.LocalDate;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;
@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
}
if (Objects.nonNull(r.getSubject())) {
r
.setSubject(
List<Subject> subjects = Lists
.newArrayList(
r
.getSubject()
.stream()
@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(GraphCleaningFunctions::cleanValue)
.collect(Collectors.toList()));
.collect(
Collectors
.toMap(
s -> Optional
.ofNullable(s.getQualifier())
.map(q -> q.getClassid() + s.getValue())
.orElse(s.getValue()),
Function.identity(),
(s1, s2) -> Collections
.min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator())))
.values());
r.setSubject(subjects);
}
if (Objects.nonNull(r.getTitle())) {
r
@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.filter(p -> StringUtils.isNotBlank(p.getValue()))
.map(p -> {
// hack to distinguish orcid from orcid_pending
String pidProvenance = Optional
.ofNullable(p.getDataInfo())
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
String pidProvenance = getProvenance(p.getDataInfo());
if (p
.getQualifier()
.getClassid()

View File

@ -11,10 +11,10 @@ import java.util.function.Function;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
@ -503,4 +503,15 @@ public class OafMapperUtils {
rel.setProperties(properties);
return rel;
}
public static String getProvenance(DataInfo dataInfo) {
return Optional
.ofNullable(dataInfo)
.map(
d -> Optional
.ofNullable(d.getProvenanceaction())
.map(Qualifier::getClassid)
.orElse(""))
.orElse("");
}
}

View File

@ -0,0 +1,46 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
import static org.apache.commons.lang3.StringUtils.isBlank;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Subject;
public class SubjectProvenanceComparator implements Comparator<Subject> {
@Override
public int compare(Subject left, Subject right) {
String lProv = getProvenance(left.getDataInfo());
String rProv = getProvenance(right.getDataInfo());
if (isBlank(lProv) && isBlank(rProv))
return 0;
if (isBlank(lProv))
return 1;
if (isBlank(rProv))
return -1;
if (lProv.equals(rProv))
return 0;
if (lProv.toLowerCase().contains("crosswalk"))
return -1;
if (rProv.toLowerCase().contains("crosswalk"))
return 1;
if (lProv.toLowerCase().contains("user"))
return -1;
if (rProv.toLowerCase().contains("user"))
return 1;
if (lProv.toLowerCase().contains("propagation"))
return -1;
if (rProv.toLowerCase().contains("propagation"))
return 1;
if (lProv.toLowerCase().contains("iis"))
return -1;
if (rProv.toLowerCase().contains("iis"))
return 1;
return 0;
}
}

View File

@ -260,14 +260,16 @@ public class GraphCleaningFunctionsTest {
.collect(Collectors.toList());
assertNotNull(fos_subjects);
assertEquals(3, fos_subjects.size());
assertEquals(2, fos_subjects.size());
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
"sysimport:crosswalk:datasetarchive".equals(s.getDataInfo().getProvenanceaction().getClassid())
));
assertTrue(
fos_subjects

View File

@ -794,6 +794,28 @@
},
"value": "0101 mathematics"
},
{
"dataInfo": {
"deletedbyinference": false,
"inferenceprovenance": "",
"inferred": false,
"invisible": false,
"provenanceaction": {
"classid": "subject:fos",
"classname": "subject:fos",
"schemeid": "dnet:provenanceActions",
"schemename": "dnet:provenanceActions"
},
"trust": "0.9"
},
"qualifier": {
"classid": "keyword",
"classname": "keyword",
"schemeid": "dnet:subject_classification_typologies",
"schemename": "dnet:subject_classification_typologies"
},
"value": "0101 mathematics"
},
{
"dataInfo": {
"deletedbyinference": false,