From 1ba582de3c9625417032294cd9212400d4e12663 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Thu, 23 Nov 2023 16:27:19 +0100 Subject: [PATCH] [graph cleaning] added cleaning for result.publisher and result.instance.license --- .../oaf/utils/GraphCleaningFunctions.java | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 7a8acbd36..d39798506 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -14,6 +14,7 @@ import java.util.function.Function; import java.util.stream.Collectors; import java.util.stream.Stream; +import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm; import org.apache.commons.lang3.StringUtils; import com.github.sisyphsu.dateparser.DateParserUtils; @@ -28,6 +29,10 @@ import me.xuender.unidecode.Unidecode; public class GraphCleaningFunctions extends CleaningFunctions { + public static final String DNET_PUBLISHERS = "dnet:publishers"; + + public static final String DNET_LICENSES = "dnet:licenses"; + public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; public static final int ORCID_LEN = 19; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; @@ -407,6 +412,13 @@ public class GraphCleaningFunctions extends CleaningFunctions { .getPublisher() .getValue() .replaceAll(NAME_CLEANING_REGEX, " ")); + + if (vocs.vocabularyExists(DNET_PUBLISHERS)) { + vocs.find(DNET_PUBLISHERS) + .map(voc -> voc.getTermBySynonym(r.getPublisher().getValue())) + .map(VocabularyTerm::getName) + .ifPresent(publisher -> r.getPublisher().setValue(publisher)); + } } } if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { @@ -567,6 +579,13 @@ public class GraphCleaningFunctions extends CleaningFunctions { i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); } + if (Objects.nonNull(i.getLicense()) && Objects.nonNull(i.getLicense().getValue())) { + vocs.find(DNET_LICENSES) + .map(voc -> voc.getTermBySynonym(i.getLicense().getValue())) + .map(VocabularyTerm::getId) + .ifPresent(license -> i.getLicense().setValue(license)); + } + // from the script from Dimitris if ("0000".equals(i.getRefereed().getClassid())) { final boolean isFromCrossref = Optional