cleaning of subjects

This commit is contained in:
Claudio Atzori 2022-08-05 16:56:09 +02:00
parent 32cee1f619
commit 4eaa063b1f
2 changed files with 33 additions and 13 deletions

View File

@ -3,13 +3,12 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable; import java.io.Serializable;
import java.util.HashMap; import java.util.HashMap;
import java.util.concurrent.atomic.AtomicReference;
import javax.jws.WebParam; import org.apache.commons.lang3.SerializationUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer; import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
@ -31,23 +30,30 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
return mapping; return mapping;
} }
private static void cleanSubject(VocabularyGroup vocabularies, Subject s) { private static void cleanSubject(VocabularyGroup vocabularies, Subject subject) {
// TODO cleaning based on different subject vocabs can be added here if (cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, subject)) {
cleanSubjectForVocabulary(ModelConstants.DNET_SUBJECT_FOS_CLASSID, vocabularies, s); return;
} else {
// TODO cleaning based on different subject vocabs can be added here
}
} }
private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies, Subject s) { private static boolean cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies,
Subject subject) {
AtomicReference<Boolean> modified = new AtomicReference<>(false);
vocabularies.find(vocabularyId).ifPresent(vocabulary -> { vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(s.getQualifier().getClassid())) { if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(subject.getQualifier().getClassid())) {
return; return;
} }
Qualifier newValue = vocabulary.lookup(s.getValue()); Qualifier newValue = vocabulary.lookup(subject.getValue());
if (!s.getValue().equals(newValue.getClassid())) { if (!subject.getValue().equals(newValue.getClassid())) {
s.setValue(newValue.getClassid()); subject.setValue(newValue.getClassid());
s.getQualifier().setClassid(vocabularyId); subject.getQualifier().setClassid(vocabularyId);
s.getQualifier().setClassname(vocabulary.getName()); subject.getQualifier().setClassname(vocabulary.getName());
modified.set(true);
} }
}); });
return modified.get();
} }
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {

View File

@ -262,6 +262,20 @@ public class GraphCleaningFunctionsTest {
assertNotNull(fos_subjects); assertNotNull(fos_subjects);
assertEquals(2, fos_subjects.size()); assertEquals(2, fos_subjects.size());
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0101 mathematics".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
assertTrue(
fos_subjects
.stream()
.anyMatch(
s -> "0102 computer and information sciences".equals(s.getValue()) &
ModelConstants.DNET_SUBJECT_FOS_CLASSID.equals(s.getQualifier().getClassid())));
// TODO add more assertions to verify the cleaned values // TODO add more assertions to verify the cleaned values
System.out.println(MAPPER.writeValueAsString(p_cleaned)); System.out.println(MAPPER.writeValueAsString(p_cleaned));
} }