forked from antonis.lempesis/dnet-hadoop
Merge pull request '[graph cleaning] added cleaning for result.publisher and result.instance.license' (#366) from clean_license_publisher into beta
Reviewed-on: D-Net/dnet-hadoop#366
This commit is contained in:
commit
2877839df0
|
@ -16,6 +16,7 @@ import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyTerm;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||||
|
@ -30,6 +31,10 @@ import me.xuender.unidecode.Unidecode;
|
||||||
|
|
||||||
public class GraphCleaningFunctions extends CleaningFunctions {
|
public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
|
|
||||||
|
public static final String DNET_PUBLISHERS = "dnet:publishers";
|
||||||
|
|
||||||
|
public static final String DNET_LICENSES = "dnet:licenses";
|
||||||
|
|
||||||
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
public static final String ORCID_CLEANING_REGEX = ".*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9]{4}).*[-–—−=].*([0-9x]{4})";
|
||||||
public static final int ORCID_LEN = 19;
|
public static final int ORCID_LEN = 19;
|
||||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
||||||
|
@ -409,6 +414,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
.getPublisher()
|
.getPublisher()
|
||||||
.getValue()
|
.getValue()
|
||||||
.replaceAll(NAME_CLEANING_REGEX, " "));
|
.replaceAll(NAME_CLEANING_REGEX, " "));
|
||||||
|
|
||||||
|
if (vocs.vocabularyExists(DNET_PUBLISHERS)) {
|
||||||
|
vocs.find(DNET_PUBLISHERS)
|
||||||
|
.map(voc -> voc.getTermBySynonym(r.getPublisher().getValue()))
|
||||||
|
.map(VocabularyTerm::getName)
|
||||||
|
.ifPresent(publisher -> r.getPublisher().setValue(publisher));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
|
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
|
||||||
|
@ -569,6 +581,13 @@ public class GraphCleaningFunctions extends CleaningFunctions {
|
||||||
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (Objects.nonNull(i.getLicense()) && Objects.nonNull(i.getLicense().getValue())) {
|
||||||
|
vocs.find(DNET_LICENSES)
|
||||||
|
.map(voc -> voc.getTermBySynonym(i.getLicense().getValue()))
|
||||||
|
.map(VocabularyTerm::getId)
|
||||||
|
.ifPresent(license -> i.getLicense().setValue(license));
|
||||||
|
}
|
||||||
|
|
||||||
// from the script from Dimitris
|
// from the script from Dimitris
|
||||||
if ("0000".equals(i.getRefereed().getClassid())) {
|
if ("0000".equals(i.getRefereed().getClassid())) {
|
||||||
final boolean isFromCrossref = Optional
|
final boolean isFromCrossref = Optional
|
||||||
|
|
Loading…
Reference in New Issue