Merge pull request '[graph cleaning] added cleaning for result.publisher and result.instance.license' (#366) from clean_license_publisher into beta

Reviewed-on: D-Net/dnet-hadoop#366
This commit is contained in:
Claudio Atzori 2023-12-08 16:58:37 +01:00
commit 2877839df0
1 changed files with 19 additions and 0 deletions

View File

@ -16,6 +16,7 @@ import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import com.github.sisyphsu.dateparser.DateParserUtils; import com.github.sisyphsu.dateparser.DateParserUtils;
@ -30,6 +31,10 @@ import me.xuender.unidecode.Unidecode;
public class GraphCleaningFunctions extends CleaningFunctions { public class GraphCleaningFunctions extends CleaningFunctions {
public static final String DNET_PUBLISHERS = "dnet:publishers";
public static final String DNET_LICENSES = "dnet:licenses";
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})"; public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
public static final int ORCID_LEN = 19; public static final int ORCID_LEN = 19;
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)"; public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
@ -409,6 +414,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
.getPublisher() .getPublisher()
.getValue() .getValue()
.replaceAll(NAME_CLEANING_REGEX, " ")); .replaceAll(NAME_CLEANING_REGEX, " "));
if (vocs.vocabularyExists(DNET_PUBLISHERS)) {
vocs.find(DNET_PUBLISHERS)
.map(voc -> voc.getTermBySynonym(r.getPublisher().getValue()))
.map(VocabularyTerm::getName)
.ifPresent(publisher -> r.getPublisher().setValue(publisher));
}
} }
} }
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
@ -569,6 +581,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
} }
if (Objects.nonNull(i.getLicense()) && Objects.nonNull(i.getLicense().getValue())) {
vocs.find(DNET_LICENSES)
.map(voc -> voc.getTermBySynonym(i.getLicense().getValue()))
.map(VocabularyTerm::getId)
.ifPresent(license -> i.getLicense().setValue(license));
}
// from the script from Dimitris // from the script from Dimitris
if ("0000".equals(i.getRefereed().getClassid())) { if ("0000".equals(i.getRefereed().getClassid())) {
final boolean isFromCrossref = Optional final boolean isFromCrossref = Optional