forked from D-Net/dnet-hadoop
cleaning of subjects: avoid duplicated subjects, prioritise collected vs inferred or other sources
This commit is contained in:
parent
adb526b0e1
commit
b7c387c21f
|
@ -1,6 +1,8 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||||
|
|
||||||
import java.time.LocalDate;
|
import java.time.LocalDate;
|
||||||
import java.time.ZoneId;
|
import java.time.ZoneId;
|
||||||
import java.time.format.DateTimeFormatter;
|
import java.time.format.DateTimeFormatter;
|
||||||
|
@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getSubject())) {
|
if (Objects.nonNull(r.getSubject())) {
|
||||||
r
|
List<Subject> subjects = Lists
|
||||||
.setSubject(
|
.newArrayList(
|
||||||
r
|
r
|
||||||
.getSubject()
|
.getSubject()
|
||||||
.stream()
|
.stream()
|
||||||
|
@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||||
.map(GraphCleaningFunctions::cleanValue)
|
.map(GraphCleaningFunctions::cleanValue)
|
||||||
.collect(Collectors.toList()));
|
.collect(
|
||||||
|
Collectors
|
||||||
|
.toMap(
|
||||||
|
s -> Optional
|
||||||
|
.ofNullable(s.getQualifier())
|
||||||
|
.map(q -> q.getClassid() + s.getValue())
|
||||||
|
.orElse(s.getValue()),
|
||||||
|
Function.identity(),
|
||||||
|
// Merge function for subjects sharing the same (classid + value) key: keep the one
// that SubjectProvenanceComparator ranks first. Both candidates must be handed to
// Collections.min, otherwise the comparator is never consulted for s2.
(s1, s2) -> Collections
.min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator())))
|
||||||
|
.values());
|
||||||
|
r.setSubject(subjects);
|
||||||
}
|
}
|
||||||
if (Objects.nonNull(r.getTitle())) {
|
if (Objects.nonNull(r.getTitle())) {
|
||||||
r
|
r
|
||||||
|
@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||||
.map(p -> {
|
.map(p -> {
|
||||||
// hack to distinguish orcid from orcid_pending
|
// hack to distinguish orcid from orcid_pending
|
||||||
String pidProvenance = Optional
|
String pidProvenance = getProvenance(p.getDataInfo());
|
||||||
.ofNullable(p.getDataInfo())
|
|
||||||
.map(
|
|
||||||
d -> Optional
|
|
||||||
.ofNullable(d.getProvenanceaction())
|
|
||||||
.map(Qualifier::getClassid)
|
|
||||||
.orElse(""))
|
|
||||||
.orElse("");
|
|
||||||
if (p
|
if (p
|
||||||
.getQualifier()
|
.getQualifier()
|
||||||
.getClassid()
|
.getClassid()
|
||||||
|
|
|
@ -11,10 +11,10 @@ import java.util.function.Function;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
|
||||||
|
@ -503,4 +503,15 @@ public class OafMapperUtils {
|
||||||
rel.setProperties(properties);
|
rel.setProperties(properties);
|
||||||
return rel;
|
return rel;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static String getProvenance(DataInfo dataInfo) {
|
||||||
|
return Optional
|
||||||
|
.ofNullable(dataInfo)
|
||||||
|
.map(
|
||||||
|
d -> Optional
|
||||||
|
.ofNullable(d.getProvenanceaction())
|
||||||
|
.map(Qualifier::getClassid)
|
||||||
|
.orElse(""))
|
||||||
|
.orElse("");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,46 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||||
|
import static org.apache.commons.lang3.StringUtils.isBlank;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||||
|
|
||||||
|
public class SubjectProvenanceComparator implements Comparator<Subject> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(Subject left, Subject right) {
|
||||||
|
|
||||||
|
String lProv = getProvenance(left.getDataInfo());
|
||||||
|
String rProv = getProvenance(right.getDataInfo());
|
||||||
|
|
||||||
|
if (isBlank(lProv) && isBlank(rProv))
|
||||||
|
return 0;
|
||||||
|
if (isBlank(lProv))
|
||||||
|
return 1;
|
||||||
|
if (isBlank(rProv))
|
||||||
|
return -1;
|
||||||
|
if (lProv.equals(rProv))
|
||||||
|
return 0;
|
||||||
|
if (lProv.toLowerCase().contains("crosswalk"))
|
||||||
|
return -1;
|
||||||
|
if (rProv.toLowerCase().contains("crosswalk"))
|
||||||
|
return 1;
|
||||||
|
if (lProv.toLowerCase().contains("user"))
|
||||||
|
return -1;
|
||||||
|
if (rProv.toLowerCase().contains("user"))
|
||||||
|
return 1;
|
||||||
|
if (lProv.toLowerCase().contains("propagation"))
|
||||||
|
return -1;
|
||||||
|
if (rProv.toLowerCase().contains("propagation"))
|
||||||
|
return 1;
|
||||||
|
if (lProv.toLowerCase().contains("iis"))
|
||||||
|
return -1;
|
||||||
|
if (rProv.toLowerCase().contains("iis"))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -260,14 +260,16 @@ public class GraphCleaningFunctionsTest {
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
|
||||||
assertNotNull(fos_subjects);
|
assertNotNull(fos_subjects);
|
||||||
assertEquals(3, fos_subjects.size());
|
assertEquals(2, fos_subjects.size());
|
||||||
|
|
||||||
assertTrue(
|
assertTrue(
|
||||||
fos_subjects
|
fos_subjects
|
||||||
.stream()
|
.stream()
|
||||||
.anyMatch(
|
.anyMatch(
|
||||||
s -> "0101 mathematics".equals(s.getValue()) &
|
s -> "0101 mathematics".equals(s.getValue()) &
|
||||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||||
|
"sysimport:crosswalk:datasetarchive".equals(s.getDataInfo().getProvenanceaction().getClassid())
|
||||||
|
));
|
||||||
|
|
||||||
assertTrue(
|
assertTrue(
|
||||||
fos_subjects
|
fos_subjects
|
||||||
|
|
|
@ -794,6 +794,28 @@
|
||||||
},
|
},
|
||||||
"value": "0101 mathematics"
|
"value": "0101 mathematics"
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"dataInfo": {
|
||||||
|
"deletedbyinference": false,
|
||||||
|
"inferenceprovenance": "",
|
||||||
|
"inferred": false,
|
||||||
|
"invisible": false,
|
||||||
|
"provenanceaction": {
|
||||||
|
"classid": "subject:fos",
|
||||||
|
"classname": "subject:fos",
|
||||||
|
"schemeid": "dnet:provenanceActions",
|
||||||
|
"schemename": "dnet:provenanceActions"
|
||||||
|
},
|
||||||
|
"trust": "0.9"
|
||||||
|
},
|
||||||
|
"qualifier": {
|
||||||
|
"classid": "keyword",
|
||||||
|
"classname": "keyword",
|
||||||
|
"schemeid": "dnet:subject_classification_typologies",
|
||||||
|
"schemename": "dnet:subject_classification_typologies"
|
||||||
|
},
|
||||||
|
"value": "0101 mathematics"
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"dataInfo": {
|
"dataInfo": {
|
||||||
"deletedbyinference": false,
|
"deletedbyinference": false,
|
||||||
|
|
Loading…
Reference in New Issue