forked from D-Net/dnet-hadoop
cleaning of subjects: avoid duplicated subjects, prioritise collected vs inferred or other sources
This commit is contained in:
parent
adb526b0e1
commit
b7c387c21f
|
@ -1,6 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
|
||||
import java.time.LocalDate;
|
||||
import java.time.ZoneId;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
|
@ -16,7 +18,6 @@ import com.github.sisyphsu.dateparser.DateParserUtils;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.collect.Sets;
|
||||
|
||||
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
@ -191,8 +192,8 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
||||
}
|
||||
if (Objects.nonNull(r.getSubject())) {
|
||||
r
|
||||
.setSubject(
|
||||
List<Subject> subjects = Lists
|
||||
.newArrayList(
|
||||
r
|
||||
.getSubject()
|
||||
.stream()
|
||||
|
@ -201,7 +202,18 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||
.map(GraphCleaningFunctions::cleanValue)
|
||||
.collect(Collectors.toList()));
|
||||
.collect(
|
||||
Collectors
|
||||
.toMap(
|
||||
s -> Optional
|
||||
.ofNullable(s.getQualifier())
|
||||
.map(q -> q.getClassid() + s.getValue())
|
||||
.orElse(s.getValue()),
|
||||
Function.identity(),
|
||||
(s1, s2) -> Collections
|
||||
.min(Lists.newArrayList(s1, s1), new SubjectProvenanceComparator())))
|
||||
.values());
|
||||
r.setSubject(subjects);
|
||||
}
|
||||
if (Objects.nonNull(r.getTitle())) {
|
||||
r
|
||||
|
@ -382,14 +394,7 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
|||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
||||
.map(p -> {
|
||||
// hack to distinguish orcid from orcid_pending
|
||||
String pidProvenance = Optional
|
||||
.ofNullable(p.getDataInfo())
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getProvenanceaction())
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(""))
|
||||
.orElse("");
|
||||
String pidProvenance = getProvenance(p.getDataInfo());
|
||||
if (p
|
||||
.getQualifier()
|
||||
.getClassid()
|
||||
|
|
|
@ -11,10 +11,10 @@ import java.util.function.Function;
|
|||
import java.util.function.Predicate;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
|
@ -503,4 +503,15 @@ public class OafMapperUtils {
|
|||
rel.setProperties(properties);
|
||||
return rel;
|
||||
}
|
||||
|
||||
public static String getProvenance(DataInfo dataInfo) {
|
||||
return Optional
|
||||
.ofNullable(dataInfo)
|
||||
.map(
|
||||
d -> Optional
|
||||
.ofNullable(d.getProvenanceaction())
|
||||
.map(Qualifier::getClassid)
|
||||
.orElse(""))
|
||||
.orElse("");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,46 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.getProvenance;
|
||||
import static org.apache.commons.lang3.StringUtils.isBlank;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Subject;
|
||||
|
||||
public class SubjectProvenanceComparator implements Comparator<Subject> {
|
||||
|
||||
@Override
|
||||
public int compare(Subject left, Subject right) {
|
||||
|
||||
String lProv = getProvenance(left.getDataInfo());
|
||||
String rProv = getProvenance(right.getDataInfo());
|
||||
|
||||
if (isBlank(lProv) && isBlank(rProv))
|
||||
return 0;
|
||||
if (isBlank(lProv))
|
||||
return 1;
|
||||
if (isBlank(rProv))
|
||||
return -1;
|
||||
if (lProv.equals(rProv))
|
||||
return 0;
|
||||
if (lProv.toLowerCase().contains("crosswalk"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("crosswalk"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("user"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("user"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("propagation"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("propagation"))
|
||||
return 1;
|
||||
if (lProv.toLowerCase().contains("iis"))
|
||||
return -1;
|
||||
if (rProv.toLowerCase().contains("iis"))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -260,14 +260,16 @@ public class GraphCleaningFunctionsTest {
|
|||
.collect(Collectors.toList());
|
||||
|
||||
assertNotNull(fos_subjects);
|
||||
assertEquals(3, fos_subjects.size());
|
||||
assertEquals(2, fos_subjects.size());
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
.stream()
|
||||
.anyMatch(
|
||||
s -> "0101 mathematics".equals(s.getValue()) &
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
|
||||
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()) &
|
||||
"sysimport:crosswalk:datasetarchive".equals(s.getDataInfo().getProvenanceaction().getClassid())
|
||||
));
|
||||
|
||||
assertTrue(
|
||||
fos_subjects
|
||||
|
|
|
@ -794,6 +794,28 @@
|
|||
},
|
||||
"value": "0101 mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
"inferenceprovenance": "",
|
||||
"inferred": false,
|
||||
"invisible": false,
|
||||
"provenanceaction": {
|
||||
"classid": "subject:fos",
|
||||
"classname": "subject:fos",
|
||||
"schemeid": "dnet:provenanceActions",
|
||||
"schemename": "dnet:provenanceActions"
|
||||
},
|
||||
"trust": "0.9"
|
||||
},
|
||||
"qualifier": {
|
||||
"classid": "keyword",
|
||||
"classname": "keyword",
|
||||
"schemeid": "dnet:subject_classification_typologies",
|
||||
"schemename": "dnet:subject_classification_typologies"
|
||||
},
|
||||
"value": "0101 mathematics"
|
||||
},
|
||||
{
|
||||
"dataInfo": {
|
||||
"deletedbyinference": false,
|
||||
|
|
Loading…
Reference in New Issue