diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index c25d67f257..2c1bc48e36 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -16,6 +16,7 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; +import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; import org.apache.commons.lang3.StringUtils; import com.github.sisyphsu.dateparser.DateParserUtils; @@ -30,6 +31,10 @@ import me.xuender.unidecode.Unidecode; public class GraphCleaningFunctions extends CleaningFunctions { + public static final String DNET_PUBLISHERS = "dnet:publishers"; + + public static final String DNET_LICENSES = "dnet:licenses"; + public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; public static final int ORCID_LEN = 19; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; @@ -409,6 +414,13 @@ public class GraphCleaningFunctions extends CleaningFunctions { .getPublisher() .getValue() .replaceAll(NAME_CLEANING_REGEX, " ")); + + if (vocs.vocabularyExists(DNET_PUBLISHERS)) { + vocs.find(DNET_PUBLISHERS) + .map(voc -> voc.getTermBySynonym(r.getPublisher().getValue())) + .map(VocabularyTerm::getName) + .ifPresent(publisher -> r.getPublisher().setValue(publisher)); + } } } if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { @@ -569,6 +581,13 @@ public class GraphCleaningFunctions extends CleaningFunctions { i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); } + if (Objects.nonNull(i.getLicense()) && Objects.nonNull(i.getLicense().getValue())) { + vocs.find(DNET_LICENSES) + .map(voc -> voc.getTermBySynonym(i.getLicense().getValue())) + .map(VocabularyTerm::getId) + .ifPresent(license -> i.getLicense().setValue(license)); + } + // from the script from Dimitris if ("0000".equals(i.getRefereed().getClassid())) { final boolean isFromCrossref = Optional