subjects cleaning #239

Merged
claudio.atzori merged 15 commits from clean_subjects into beta 2022-09-09 15:17:09 +02:00
3 changed files with 37 additions and 5 deletions
Showing only changes of commit b78889a0ce - Show all commits

View File

@ -83,4 +83,10 @@ public class Vocabulary implements Serializable {
.orElse(null);
}
/**
 * Resolves the given identifier to a qualifier of this vocabulary.
 * <p>
 * The synonym lookup is attempted first; the term lookup is performed only when the
 * synonym lookup yields {@code null}.
 *
 * @param id the synonym or term identifier to resolve
 * @return the result of {@code getSynonymAsQualifier(id)} when it is non-null,
 *         otherwise the result of {@code getTermAsQualifier(id)}
 */
public Qualifier lookup(String id) {
	return Optional
		.ofNullable(getSynonymAsQualifier(id))
		// orElseGet defers the term lookup, so no fallback qualifier is built when a synonym matched
		.orElseGet(() -> getTermAsQualifier(id));
}
}

View File

@ -81,6 +81,13 @@ public class VocabularyGroup implements Serializable {
vocs.put(id.toLowerCase(), new Vocabulary(id, name));
}
/**
 * Looks up a vocabulary by its identifier, ignoring case.
 *
 * @param vocId the vocabulary identifier; may be {@code null}
 * @return the vocabulary registered under the lower-cased identifier, or an empty
 *         {@code Optional} when {@code vocId} is {@code null} or no entry is found
 */
public Optional<Vocabulary> find(final String vocId) {
	if (vocId == null) {
		return Optional.empty();
	}
	// Keys are stored lower-cased, so normalise the id before the map lookup.
	final String key = vocId.toLowerCase();
	return Optional.ofNullable(vocs.get(key));
}
public void addTerm(final String vocId, final String id, final String name) {
if (vocabularyExists(vocId)) {
vocs.get(vocId.toLowerCase()).addTerm(id, name);

View File

@ -4,15 +4,15 @@ package eu.dnetlib.dhp.oa.graph.clean;
import java.io.Serializable;
import java.util.HashMap;
import eu.dnetlib.dhp.common.vocabulary.Vocabulary;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.SerializableConsumer;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import javax.jws.WebParam;
public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable {
@ -27,10 +27,29 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
mapping.put(Subject.class, o -> cleanSubject(vocabularies, (Subject) o));
return mapping;
}
/**
 * Cleans the given subject against the supported subject vocabularies. At present only the
 * vocabulary identified by {@code ModelConstants.DNET_SUBJECT_FOS_CLASSID} is applied.
 *
 * @param vocabularies the vocabularies available to the cleaning rules
 * @param s the subject to clean; it is modified in place
 */
private static void cleanSubject(VocabularyGroup vocabularies, Subject s) {
	// TODO cleaning based on different subject vocabs can be added here
	final String subjectVocabularyId = ModelConstants.DNET_SUBJECT_FOS_CLASSID;
	cleanSubjectForVocabulary(subjectVocabularyId, vocabularies, s);
}
/**
 * Rewrites a keyword subject using the terms of the given vocabulary.
 * <p>
 * Nothing happens when the vocabulary is not found in {@code vocabularies}, or when the
 * subject's qualifier class id is not {@code ModelConstants.DNET_SUBJECT_KEYWORD}
 * (compared ignoring case). Otherwise the subject value is looked up in the vocabulary and,
 * if the resolved class id differs from the current value, the subject value is replaced
 * and its qualifier is re-pointed to the vocabulary (class id and class name).
 *
 * @param vocabularyId identifier of the vocabulary to clean against
 * @param vocabularies the vocabularies available to the cleaning rules
 * @param s the subject to clean; it is modified in place
 */
private static void cleanSubjectForVocabulary(String vocabularyId, VocabularyGroup vocabularies, Subject s) {
vocabularies.find(vocabularyId).ifPresent(vocabulary -> {
// Only subjects currently typed as keyword are candidates for cleaning.
// NOTE(review): s.getQualifier() is dereferenced without a null check - confirm callers guarantee it.
if (!ModelConstants.DNET_SUBJECT_KEYWORD.equalsIgnoreCase(s.getQualifier().getClassid())) {
return;
}
Qualifier newValue = vocabulary.lookup(s.getValue());
// The subject is rewritten only when the lookup resolves to a class id other than the current value.
// NOTE(review): when the value already equals the resolved class id the qualifier is left untouched,
// so such a subject keeps its keyword qualifier - confirm this is intended.
// NOTE(review): s.getValue() is dereferenced without a null check - confirm it cannot be null here.
if (!s.getValue().equals(newValue.getClassid())) {
s.setValue(newValue.getClassid());
s.getQualifier().setClassid(vocabularyId);
s.getQualifier().setClassname(vocabulary.getName());
}

I do not understand why you compare subject.value and newValue.classid

I do not understand why you compare subject.value and newValue.classid

I saw by myself. It is ok.

One thing: you will not change the classId of the subject if the value provided is equal to the term in the vocabulary.

I saw by myself. It is ok. One thing: you will not change the classId of the subject if the value provided is equal to the term in the vocabulary.

You are right. However, the vocabulary.lookup method already tries to match a synonym and, in case it can't, it then looks for a matching equivalent term:

/**
 * Resolves the given identifier to a qualifier of this vocabulary: the synonym lookup is
 * attempted first and the term lookup is performed only when it yields {@code null}.
 *
 * @param id the synonym or term identifier to resolve
 * @return the synonym qualifier when non-null, otherwise the result of {@code getTermAsQualifier(id)}
 */
public Qualifier lookup(String id) {
	return Optional
		.ofNullable(getSynonymAsQualifier(id))
		// orElseGet defers the term lookup until the synonym lookup has actually failed
		.orElseGet(() -> getTermAsQualifier(id));
}

Then, in case a matching term cannot be found, the method returns a qualifier set to UNKNOWN

/**
 * Builds the qualifier for a term identifier of this vocabulary.
 *
 * @param termId the term identifier to resolve
 * @return the result of {@code OafMapperUtils.unknown} for a blank id; a qualifier carrying the
 *         term's id and name when the term exists; otherwise a qualifier that uses
 *         {@code termId} as both class id and class name
 */
public Qualifier getTermAsQualifier(final String termId) {
	// A blank id cannot be resolved.
	if (StringUtils.isBlank(termId)) {
		return OafMapperUtils.unknown(getId(), getName());
	}
	// An id that is not a term of this vocabulary is echoed back as both class id and class name.
	if (!termExists(termId)) {
		return OafMapperUtils.qualifier(termId, termId, getId(), getName());
	}
	final VocabularyTerm term = getTerm(termId);
	return OafMapperUtils.qualifier(term.getId(), term.getName(), getId(), getName());
}

hence I could just exploit this to decide what to do when the value provided is equal to the term in the vocabulary.

You are right. However, the `vocabulary.lookup` method already tries to match a synonym and, in case it can't, it then looks for a matching equivalent term: ``` public Qualifier lookup(String id) { return Optional .ofNullable(getSynonymAsQualifier(id)) .orElse(getTermAsQualifier(id)); } ``` Then, in case a matching term cannot be found, the method returns a qualifier set to UNKNOWN ``` public Qualifier getTermAsQualifier(final String termId) { if (StringUtils.isBlank(termId)) { return OafMapperUtils.unknown(getId(), getName()); } else if (termExists(termId)) { final VocabularyTerm t = getTerm(termId); return OafMapperUtils.qualifier(t.getId(), t.getName(), getId(), getName()); } else { return OafMapperUtils.qualifier(termId, termId, getId(), getName()); } } ``` hence I could just exploit this to decide what to do when the value provided is equal to the term in the vocabulary.

Yes, I do agree

Yes, I do agree
});
}
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {
if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) {
Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType());