WIP: cleaning of subjects

This commit is contained in:
Claudio Atzori 2022-08-05 12:32:08 +02:00
parent 6c0fd9284b
commit 32cee1f619
7 changed files with 37 additions and 21 deletions

View File

@ -85,8 +85,8 @@ public class Vocabulary implements Serializable {
public Qualifier lookup(String id) { public Qualifier lookup(String id) {
return Optional return Optional
.ofNullable(getSynonymAsQualifier(id)) .ofNullable(getSynonymAsQualifier(id))
.orElse(getTermAsQualifier(id)); .orElse(getTermAsQualifier(id));
} }
} }

View File

@ -83,9 +83,9 @@ public class VocabularyGroup implements Serializable {
public Optional<Vocabulary> find(final String vocId) { public Optional<Vocabulary> find(final String vocId) {
return Optional return Optional
.ofNullable(vocId) .ofNullable(vocId)
.map(String::toLowerCase) .map(String::toLowerCase)
.map(vocs::get); .map(vocs::get);
} }
public void addTerm(final String vocId, final String id, final String name) { public void addTerm(final String vocId, final String id, final String name) {

View File

@ -4,15 +4,15 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary; import javax.jws.WebParam;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import javax.jws.WebParam;
public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable { public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable {

View File

@ -251,6 +251,17 @@ public class GraphCleaningFunctionsTest {
pid.getQualifier().getClassname())); pid.getQualifier().getClassname()));
}); });
assertNotNull(p_cleaned.getSubject());
List<Subject> fos_subjects = p_cleaned
.getSubject()
.stream()
.filter(s -> ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid()))
.collect(Collectors.toList());
assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size());
// TODO add more assertions to verity the cleaned values // TODO add more assertions to verity the cleaned values
System.out.println(MAPPER.writeValueAsString(p_cleaned)); System.out.println(MAPPER.writeValueAsString(p_cleaned));
} }

View File

@ -743,12 +743,12 @@
"trust": "0.9" "trust": "0.9"
}, },
"qualifier": { "qualifier": {
"classid": "", "classid": "keyword",
"classname": "", "classname": "keyword",
"schemeid": "", "schemeid": "dnet:subject_classification_typologies",
"schemename": "" "schemename": "dnet:subject_classification_typologies"
}, },
"value": "infrared detectors" "value": "FOS: Mathematics"
}, },
{ {
"dataInfo": { "dataInfo": {
@ -765,12 +765,12 @@
"trust": "0.9" "trust": "0.9"
}, },
"qualifier": { "qualifier": {
"classid": "", "classid": "keyword",
"classname": "", "classname": "keyword",
"schemeid": "", "schemeid": "dnet:subject_classification_typologies",
"schemename": "" "schemename": "dnet:subject_classification_typologies"
}, },
"value": "lens antennas" "value": "FOS: Computer and information sciences"
}, },
{ {
"dataInfo": { "dataInfo": {

View File

@ -1243,4 +1243,6 @@ dnet:relation_relClass @=@ IsSupplementTo @=@ isSupplementTo
dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy
dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo
dnet:relation_subRelType @=@ relationship @=@ publicationDataset dnet:relation_subRelType @=@ relationship @=@ publicationDataset
dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned dnet:provenanceActions @=@ iis @=@ erroneous label to be cleaned
FOS @=@ 0101 mathematics @=@ FOS: Mathematics
FOS @=@ 0102 computer and information sciences @=@ FOS: Computer and information sciences

View File

@ -1117,4 +1117,7 @@ dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ relationship @=@ relat
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version
FOS @=@ Fields of Science and Technology classification @=@ 0101 mathematics @=@ 0101 mathematics
FOS @=@ Fields of Science and Technology classification @=@ 0102 computer and information sciences @=@ 0102 computer and information sciences
FOS @=@ Fields of Science and Technology classification @=@ 0103 physical sciences @=@ 0103 physical sciences