From 1d33074fd1d05dc4d11bd976056a5a1001bf9b48 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Fri, 9 Jun 2023 16:47:25 +0200 Subject: [PATCH] WIP: pid cleaning --- .../dhp/schema/oaf/utils/DoiCleaningRule.java | 14 + .../schema/oaf/utils/FundRefCleaningRule.java | 23 + .../oaf/utils/GraphCleaningFunctions.java | 717 +++++++++--------- .../schema/oaf/utils/GridCleaningRule.java | 18 + .../schema/oaf/utils/ISNICleaningRule.java | 19 + .../dhp/schema/oaf/utils/PICCleaningRule.java | 19 + .../dhp/schema/oaf/utils/PidCleaner.java | 62 ++ .../dhp/schema/oaf/utils/PmcCleaningRule.java | 13 + .../schema/oaf/utils/PmidCleaningRule.java | 16 + .../dhp/schema/oaf/utils/RorCleaningRule.java | 18 + 10 files changed, 573 insertions(+), 346 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/DoiCleaningRule.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/DoiCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/DoiCleaningRule.java new file mode 100644 index 000000000..1a7482685 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/DoiCleaningRule.java @@ -0,0 +1,14 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +public class DoiCleaningRule { + + public static String clean(final String doi) { + return doi + .toLowerCase() + .replaceAll("\\s", "") + .replaceAll("^doi:", "") + .replaceFirst(CleaningFunctions.DOI_PREFIX_REGEX, CleaningFunctions.DOI_PREFIX); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java new file mode 100644 index 000000000..7f6303825 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/FundRefCleaningRule.java @@ -0,0 +1,23 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class FundRefCleaningRule { + + public static String clean(final String fundrefId) { + + String s = fundrefId + .toLowerCase() + .replaceAll("\\s", ""); + + Matcher m = Pattern.compile("\\d+").matcher(s); + if (m.matches()) { + return m.group(); + } else { + return ""; + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index 1aee72f09..a47b63edb 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -162,52 +162,66 @@ public class GraphCleaningFunctions extends CleaningFunctions { } public static T fixVocabularyNames(T value) { - if (value instanceof Datasource) { - // nothing to clean here - } else if (value instanceof Project) { - // nothing to clean here - } else if (value instanceof Organization) { - Organization o = (Organization) value; - if (Objects.nonNull(o.getCountry())) { - fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE); + if (value instanceof OafEntity) { + + OafEntity e = (OafEntity) value; + + Optional + .ofNullable(e.getPid()) + .ifPresent(pid -> pid.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES))); + + if (value instanceof Result) { + Result r = (Result) value; + + fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES); + fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE); + fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES); + + if (Objects.nonNull(r.getSubject())) { + r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES)); + } + if (Objects.nonNull(r.getInstance())) { + for (Instance i : r.getInstance()) { + fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES); + fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS); + Optional + .ofNullable(i.getPid()) + .ifPresent( + pid -> pid.forEach(p -> fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES))); + + } + } + if (Objects.nonNull(r.getAuthor())) { + r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> { + if (Objects.nonNull(a.getPid())) { + a.getPid().stream().filter(Objects::nonNull).forEach(p -> { + fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES); + }); + } + }); + } + if (value instanceof Publication) { + + } else if (value instanceof Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + } else if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.nonNull(o.getCountry())) { + fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE); + } + } } else if (value instanceof Relation) { // nothing to clean here - } else if (value instanceof Result) { - - Result r = (Result) value; - - fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES); - fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE); - fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES); - - if (Objects.nonNull(r.getSubject())) { - r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES)); - } - if (Objects.nonNull(r.getInstance())) { - for (Instance i : r.getInstance()) { - fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES); - fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS); - } - } - if (Objects.nonNull(r.getAuthor())) { - r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> { - if (Objects.nonNull(a.getPid())) { - a.getPid().stream().filter(Objects::nonNull).forEach(p -> { - fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES); - }); - } - }); - } - if (value instanceof Publication) { - - } else if (value instanceof Dataset) { - - } else if (value instanceof OtherResearchProduct) { - - } else if (value instanceof Software) { - - } } return value; @@ -260,15 +274,320 @@ public class GraphCleaningFunctions extends CleaningFunctions { } public static T cleanup(T value, VocabularyGroup vocs) { - if (value instanceof Datasource) { - // nothing to clean here - } else if (value instanceof Project) { - // nothing to clean here - } else if (value instanceof Organization) { - Organization o = (Organization) value; - if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { - o.setCountry(ModelConstants.UNKNOWN_COUNTRY); + + if (value instanceof OafEntity) { + + OafEntity e = (OafEntity) value; + if (Objects.nonNull(e.getPid())) { + e.setPid(processPidCleaning(e.getPid())); } + + if (value instanceof Datasource) { + // nothing to clean here + } else if (value instanceof Project) { + // nothing to clean here + } else if (value instanceof Organization) { + Organization o = (Organization) value; + if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) { + o.setCountry(ModelConstants.UNKNOWN_COUNTRY); + } + } else if (value instanceof Result) { + Result r = (Result) value; + + if (Objects.nonNull(r.getDateofacceptance())) { + Optional date = cleanDateField(r.getDateofacceptance()); + if (date.isPresent()) { + r.getDateofacceptance().setValue(date.get()); + } else { + r.setDateofacceptance(null); + } + } + if (Objects.nonNull(r.getRelevantdate())) { + r + .setRelevantdate( + r + .getRelevantdate() + .stream() + .filter(Objects::nonNull) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(sp -> { + sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue())); + return sp; + }) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { + r.setPublisher(null); + } + if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { + r + .setLanguage( + qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); + } + if (Objects.nonNull(r.getSubject())) { + List subjects = Lists + .newArrayList( + r + .getSubject() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter(sp -> Objects.nonNull(sp.getQualifier())) + .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) + .map(s -> { + if ("dnet:result_subject".equals(s.getQualifier().getClassid())) { + s.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_TYPOLOGIES); + s.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_TYPOLOGIES); + } + return s; + }) + .map(GraphCleaningFunctions::cleanValue) + .collect( + Collectors + .toMap( + s -> Optional + .ofNullable(s.getQualifier()) + .map(q -> q.getClassid() + s.getValue()) + .orElse(s.getValue()), + Function.identity(), + (s1, s2) -> Collections + .min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator()))) + .values()); + r.setSubject(subjects); + } + if (Objects.nonNull(r.getTitle())) { + r + .setTitle( + r + .getTitle() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter( + sp -> { + final String title = sp + .getValue() + .toLowerCase(); + final String decoded = Unidecode.decode(title); + + if (StringUtils.contains(decoded, TITLE_TEST)) { + return decoded + .replaceAll(TITLE_FILTER_REGEX, "") + .length() > TITLE_FILTER_RESIDUAL_LENGTH; + } + return !decoded + .replaceAll("\\W|\\d", "") + .isEmpty(); + }) + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getFormat())) { + r + .setFormat( + r + .getFormat() + .stream() + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.nonNull(r.getDescription())) { + r + .setDescription( + r + .getDescription() + .stream() + .filter(Objects::nonNull) + .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .map(GraphCleaningFunctions::cleanValue) + .collect(Collectors.toList())); + } + if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { + r + .setResourcetype( + qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); + } + if (Objects.nonNull(r.getInstance())) { + + for (Instance i : r.getInstance()) { + if (!vocs + .termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) { + if (r instanceof Publication) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0038", "Other literature type", + ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof Dataset) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof Software) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } else if (r instanceof OtherResearchProduct) { + i + .setInstancetype( + OafMapperUtils + .qualifier( + "0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE, + ModelConstants.DNET_PUBLICATION_RESOURCE)); + } + } + + if (Objects.nonNull(i.getPid())) { + i.setPid(processPidCleaning(i.getPid())); + } + if (Objects.nonNull(i.getAlternateIdentifier())) { + i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier())); + } + Optional + .ofNullable(i.getPid()) + .ifPresent(pid -> { + final Set pids = Sets.newHashSet(pid); + Optional + .ofNullable(i.getAlternateIdentifier()) + .ifPresent(altId -> { + final Set altIds = Sets.newHashSet(altId); + i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); + }); + }); + + if (Objects.isNull(i.getAccessright()) + || StringUtils.isBlank(i.getAccessright().getClassid())) { + i + .setAccessright( + accessRight( + ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES)); + } + if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { + i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); + } + if (Objects.isNull(i.getRefereed()) || StringUtils.isBlank(i.getRefereed().getClassid())) { + i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); + } + if (Objects.nonNull(i.getDateofacceptance())) { + Optional date = cleanDateField(i.getDateofacceptance()); + if (date.isPresent()) { + i.getDateofacceptance().setValue(date.get()); + } else { + i.setDateofacceptance(null); + } + } + } + } + if (Objects.isNull(r.getBestaccessright()) + || StringUtils.isBlank(r.getBestaccessright().getClassid())) { + Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance()); + if (Objects.isNull(bestaccessrights)) { + r + .setBestaccessright( + qualifier( + ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, + ModelConstants.DNET_ACCESS_MODES)); + } else { + r.setBestaccessright(bestaccessrights); + } + } + if (Objects.nonNull(r.getAuthor())) { + r + .setAuthor( + r + .getAuthor() + .stream() + .filter(Objects::nonNull) + .filter(a -> StringUtils.isNotBlank(a.getFullname())) + .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) + .collect(Collectors.toList())); + + boolean nullRank = r + .getAuthor() + .stream() + .anyMatch(a -> Objects.isNull(a.getRank())); + if (nullRank) { + int i = 1; + for (Author author : r.getAuthor()) { + author.setRank(i++); + } + } + + for (Author a : r.getAuthor()) { + if (Objects.isNull(a.getPid())) { + a.setPid(Lists.newArrayList()); + } else { + a + .setPid( + a + .getPid() + .stream() + .filter(Objects::nonNull) + .filter(p -> Objects.nonNull(p.getQualifier())) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .map(p -> { + // hack to distinguish orcid from orcid_pending + String pidProvenance = getProvenance(p.getDataInfo()); + if (p + .getQualifier() + .getClassid() + .toLowerCase() + .contains(ModelConstants.ORCID)) { + if (pidProvenance + .equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) { + p.getQualifier().setClassid(ModelConstants.ORCID); + } else { + p.getQualifier().setClassid(ModelConstants.ORCID_PENDING); + } + final String orcid = p + .getValue() + .trim() + .toLowerCase() + .replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4"); + if (orcid.length() == ORCID_LEN) { + p.setValue(orcid); + } else { + p.setValue(""); + } + } + return p; + }) + .filter(p -> StringUtils.isNotBlank(p.getValue())) + .collect( + Collectors + .toMap( + p -> p.getQualifier().getClassid() + p.getValue(), + Function.identity(), + (p1, p2) -> p1, + LinkedHashMap::new)) + .values() + .stream() + .collect(Collectors.toList())); + } + } + } + if (value instanceof Publication) { + + } else if (value instanceof Dataset) { + + } else if (value instanceof OtherResearchProduct) { + + } else if (value instanceof Software) { + + } + + } + } else if (value instanceof Relation) { Relation r = (Relation) value; @@ -280,300 +599,6 @@ public class GraphCleaningFunctions extends CleaningFunctions { r.setValidationDate(null); r.setValidated(false); } - } else if (value instanceof Result) { - - Result r = (Result) value; - - if (Objects.nonNull(r.getDateofacceptance())) { - Optional date = cleanDateField(r.getDateofacceptance()); - if (date.isPresent()) { - r.getDateofacceptance().setValue(date.get()); - } else { - r.setDateofacceptance(null); - } - } - if (Objects.nonNull(r.getRelevantdate())) { - r - .setRelevantdate( - r - .getRelevantdate() - .stream() - .filter(Objects::nonNull) - .filter(sp -> Objects.nonNull(sp.getQualifier())) - .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) - .map(sp -> { - sp.setValue(GraphCleaningFunctions.cleanDate(sp.getValue())); - return sp; - }) - .filter(sp -> StringUtils.isNotBlank(sp.getValue())) - .collect(Collectors.toList())); - } - if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { - r.setPublisher(null); - } - if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { - r - .setLanguage( - qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES)); - } - if (Objects.nonNull(r.getSubject())) { - List subjects = Lists - .newArrayList( - r - .getSubject() - .stream() - .filter(Objects::nonNull) - .filter(sp -> StringUtils.isNotBlank(sp.getValue())) - .filter(sp -> Objects.nonNull(sp.getQualifier())) - .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) - .map(s -> { - if ("dnet:result_subject".equals(s.getQualifier().getClassid())) { - s.getQualifier().setClassid(ModelConstants.DNET_SUBJECT_TYPOLOGIES); - s.getQualifier().setClassname(ModelConstants.DNET_SUBJECT_TYPOLOGIES); - } - return s; - }) - .map(GraphCleaningFunctions::cleanValue) - .collect( - Collectors - .toMap( - s -> Optional - .ofNullable(s.getQualifier()) - .map(q -> q.getClassid() + s.getValue()) - .orElse(s.getValue()), - Function.identity(), - (s1, s2) -> Collections - .min(Lists.newArrayList(s1, s2), new SubjectProvenanceComparator()))) - .values()); - r.setSubject(subjects); - } - if (Objects.nonNull(r.getTitle())) { - r - .setTitle( - r - .getTitle() - .stream() - .filter(Objects::nonNull) - .filter(sp -> StringUtils.isNotBlank(sp.getValue())) - .filter( - sp -> { - final String title = sp - .getValue() - .toLowerCase(); - final String decoded = Unidecode.decode(title); - - if (StringUtils.contains(decoded, TITLE_TEST)) { - return decoded - .replaceAll(TITLE_FILTER_REGEX, "") - .length() > TITLE_FILTER_RESIDUAL_LENGTH; - } - return !decoded - .replaceAll("\\W|\\d", "") - .isEmpty(); - }) - .map(GraphCleaningFunctions::cleanValue) - .collect(Collectors.toList())); - } - if (Objects.nonNull(r.getFormat())) { - r - .setFormat( - r - .getFormat() - .stream() - .map(GraphCleaningFunctions::cleanValue) - .collect(Collectors.toList())); - } - if (Objects.nonNull(r.getDescription())) { - r - .setDescription( - r - .getDescription() - .stream() - .filter(Objects::nonNull) - .filter(sp -> StringUtils.isNotBlank(sp.getValue())) - .map(GraphCleaningFunctions::cleanValue) - .collect(Collectors.toList())); - } - if (Objects.nonNull(r.getPid())) { - r.setPid(processPidCleaning(r.getPid())); - } - if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { - r - .setResourcetype( - qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE)); - } - if (Objects.nonNull(r.getInstance())) { - - for (Instance i : r.getInstance()) { - if (!vocs.termExists(ModelConstants.DNET_PUBLICATION_RESOURCE, i.getInstancetype().getClassid())) { - if (r instanceof Publication) { - i - .setInstancetype( - OafMapperUtils - .qualifier( - "0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE, - ModelConstants.DNET_PUBLICATION_RESOURCE)); - } else if (r instanceof Dataset) { - i - .setInstancetype( - OafMapperUtils - .qualifier( - "0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE, - ModelConstants.DNET_PUBLICATION_RESOURCE)); - } else if (r instanceof Software) { - i - .setInstancetype( - OafMapperUtils - .qualifier( - "0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE, - ModelConstants.DNET_PUBLICATION_RESOURCE)); - } else if (r instanceof OtherResearchProduct) { - i - .setInstancetype( - OafMapperUtils - .qualifier( - "0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE, - ModelConstants.DNET_PUBLICATION_RESOURCE)); - } - } - - if (Objects.nonNull(i.getPid())) { - i.setPid(processPidCleaning(i.getPid())); - } - if (Objects.nonNull(i.getAlternateIdentifier())) { - i.setAlternateIdentifier(processPidCleaning(i.getAlternateIdentifier())); - } - Optional - .ofNullable(i.getPid()) - .ifPresent(pid -> { - final Set pids = Sets.newHashSet(pid); - Optional - .ofNullable(i.getAlternateIdentifier()) - .ifPresent(altId -> { - final Set altIds = Sets.newHashSet(altId); - i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids))); - }); - }); - - if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) { - i - .setAccessright( - accessRight( - ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, - ModelConstants.DNET_ACCESS_MODES)); - } - if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) { - i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY); - } - if (Objects.isNull(i.getRefereed()) || StringUtils.isBlank(i.getRefereed().getClassid())) { - i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS)); - } - if (Objects.nonNull(i.getDateofacceptance())) { - Optional date = cleanDateField(i.getDateofacceptance()); - if (date.isPresent()) { - i.getDateofacceptance().setValue(date.get()); - } else { - i.setDateofacceptance(null); - } - } - } - } - if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { - Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance()); - if (Objects.isNull(bestaccessrights)) { - r - .setBestaccessright( - qualifier( - ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE, - ModelConstants.DNET_ACCESS_MODES)); - } else { - r.setBestaccessright(bestaccessrights); - } - } - if (Objects.nonNull(r.getAuthor())) { - r - .setAuthor( - r - .getAuthor() - .stream() - .filter(Objects::nonNull) - .filter(a -> StringUtils.isNotBlank(a.getFullname())) - .filter(a -> StringUtils.isNotBlank(a.getFullname().replaceAll("[\\W]", ""))) - .collect(Collectors.toList())); - - boolean nullRank = r - .getAuthor() - .stream() - .anyMatch(a -> Objects.isNull(a.getRank())); - if (nullRank) { - int i = 1; - for (Author author : r.getAuthor()) { - author.setRank(i++); - } - } - - for (Author a : r.getAuthor()) { - if (Objects.isNull(a.getPid())) { - a.setPid(Lists.newArrayList()); - } else { - a - .setPid( - a - .getPid() - .stream() - .filter(Objects::nonNull) - .filter(p -> Objects.nonNull(p.getQualifier())) - .filter(p -> StringUtils.isNotBlank(p.getValue())) - .map(p -> { - // hack to distinguish orcid from orcid_pending - String pidProvenance = getProvenance(p.getDataInfo()); - if (p - .getQualifier() - .getClassid() - .toLowerCase() - .contains(ModelConstants.ORCID)) { - if (pidProvenance - .equals(ModelConstants.SYSIMPORT_CROSSWALK_ENTITYREGISTRY)) { - p.getQualifier().setClassid(ModelConstants.ORCID); - } else { - p.getQualifier().setClassid(ModelConstants.ORCID_PENDING); - } - final String orcid = p - .getValue() - .trim() - .toLowerCase() - .replaceAll(ORCID_CLEANING_REGEX, "$1-$2-$3-$4"); - if (orcid.length() == ORCID_LEN) { - p.setValue(orcid); - } else { - p.setValue(""); - } - } - return p; - }) - .filter(p -> StringUtils.isNotBlank(p.getValue())) - .collect( - Collectors - .toMap( - p -> p.getQualifier().getClassid() + p.getValue(), - Function.identity(), - (p1, p2) -> p1, - LinkedHashMap::new)) - .values() - .stream() - .collect(Collectors.toList())); - } - } - } - if (value instanceof Publication) { - - } else if (value instanceof Dataset) { - - } else if (value instanceof OtherResearchProduct) { - - } else if (value instanceof Software) { - - } } return value; @@ -628,7 +653,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase())) .filter(sp -> Objects.nonNull(sp.getQualifier())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) - .map(CleaningFunctions::normalizePidValue) + .map(PidCleaner::normalizePidValue) .filter(CleaningFunctions::pidFilter) .collect(Collectors.toList()); } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java new file mode 100644 index 000000000..ff45d6a0d --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GridCleaningRule.java @@ -0,0 +1,18 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class GridCleaningRule { + + public static String clean(String grid) { + String s = grid + .replaceAll("\\s", "") + .toLowerCase(); + + Matcher m = Pattern.compile("\\d{4,6}\\.[0-9a-z]{1,2}").matcher(s); + return m.matches() ? "grid." + m.group() : ""; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java new file mode 100644 index 000000000..5bc49c453 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ISNICleaningRule.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +// https://www.wikidata.org/wiki/Property:P213 +public class ISNICleaningRule { + + public static String clean(final String isni) { + + Matcher m = Pattern.compile("([0]{4}) ?([0-9]{4}) ?([0-9]{4}) ?([0-9]{3}[0-9X])").matcher(isni); + if (m.matches()) { + return String.join("", m.group(1), m.group(2), m.group(3), m.group(4)); + } else { + return ""; + } + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java new file mode 100644 index 000000000..83b9a1f9f --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PICCleaningRule.java @@ -0,0 +1,19 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class PICCleaningRule { + + public static String clean(final String pic) { + + Matcher m = Pattern.compile("\\d{9}").matcher(pic); + if (m.matches()) { + return m.group(); + } else { + return ""; + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java new file mode 100644 index 000000000..114c2b3af --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidCleaner.java @@ -0,0 +1,62 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.Optional; + +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class PidCleaner { + + /** + * Utility method that normalises PID values on a per-type basis. + * @param pid the PID whose value will be normalised. + * @return the PID containing the normalised value. + */ + public static StructuredProperty normalizePidValue(StructuredProperty pid) { + pid + .setValue( + normalizePidValue( + pid.getQualifier().getClassid(), + pid.getValue())); + + return pid; + } + + public static String normalizePidValue(String pidType, String pidValue) { + String value = Optional + .ofNullable(pidValue) + .map(String::trim) + .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty")); + + switch (pidType) { + + // TODO add cleaning for more PID types as needed + + // Result + case "doi": + return DoiCleaningRule.clean(value); + case "pmid": + return PmidCleaningRule.clean(value); + case "pmc": + return PmcCleaningRule.clean(value); + case "handle": + case "arXiv": + return value; + + // Organization + case "GRID": + return GridCleaningRule.clean(value); + case "ISNI": + return ISNICleaningRule.clean(value); + case "ROR": + return RorCleaningRule.clean(value); + case "PIC": + return PICCleaningRule.clean(value); + case "FundRef": + return FundRefCleaningRule.clean(value); + default: + return value; + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java new file mode 100644 index 000000000..4e1205805 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmcCleaningRule.java @@ -0,0 +1,13 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +public class PmcCleaningRule { + + public static String clean(String pmc) { + String s = pmc + .replaceAll("\\s", "") + .toUpperCase(); + return s.matches("^PMC\\d{1,8}$") ? s : ""; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java new file mode 100644 index 000000000..65833a594 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PmidCleaningRule.java @@ -0,0 +1,16 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +// https://researchguides.stevens.edu/c.php?g=442331&p=6577176 +public class PmidCleaningRule { + + public static String clean(String pmid) { + String s = pmid + .toLowerCase() + .replaceAll("\\s", "") + .trim() + .replaceAll("^0+", ""); + return s.matches("^\\d{1,8}$") ? s : ""; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java new file mode 100644 index 000000000..f40cdb00c --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/RorCleaningRule.java @@ -0,0 +1,18 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +// https://ror.readme.io/docs/ror-identifier-pattern +public class RorCleaningRule { + + public static String clean(String ror) { + String s = ror + .replaceAll("\\s", "") + .toLowerCase(); + Matcher m = Pattern.compile("0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2}").matcher(s); + return m.matches() ? "https://ror.org/" + m.group() : ""; + } + +}