From 1845dcfedf0ab11a887187c1a6487add2e2a4ef6 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 1 Feb 2023 16:24:35 +0100 Subject: [PATCH] WIP: refactoring the internal graph data model and its utilities --- .../dhp/common/vocabulary/Vocabulary.java | 6 +- .../common/vocabulary/VocabularyGroup.java | 6 +- .../eu/dnetlib/dhp/oa/merge/AuthorMerger.java | 5 +- .../oa/merge/DispatchEntitiesSparkJob.java | 4 +- .../dhp/oa/merge/GroupEntitiesSparkJob.java | 34 +- .../oaf/common/AccessRightComparator.java | 69 ++ .../dhp/schema/oaf/common/EntityType.java | 21 + .../dhp/schema/oaf/common/MainEntityType.java | 7 + .../dhp/schema/oaf/common/ModelSupport.java | 417 +++++++++++ .../schema/oaf/common/RefereedComparator.java | 45 ++ .../schema/oaf/common/RelationInverse.java | 46 ++ .../schema/oaf/utils/CleaningFunctions.java | 76 ++ .../oaf/utils/GraphCleaningFunctions.java | 98 +-- .../schema/oaf/utils/IdentifierFactory.java | 294 ++++++++ .../dhp/schema/oaf/utils/MergeBeanUtils.java | 104 +++ .../dhp/schema/oaf/utils/MergeUtils.java | 661 ++++++++++++++++++ .../dhp/schema/oaf/utils/MergeUtils2.java | 156 +++++ .../dhp/schema/oaf/utils/MergeUtils3.java | 89 +++ .../dhp/schema/oaf/utils/ModelHardLimits.java | 25 + .../dhp/schema/oaf/utils/OafMapperUtils.java | 180 +++-- .../oaf/utils/OrganizationPidComparator.java | 38 + .../dhp/schema/oaf/utils/PidBlacklist.java | 8 + .../oaf/utils/PidBlacklistProvider.java | 40 ++ .../dhp/schema/oaf/utils/PidComparator.java | 48 ++ .../dnetlib/dhp/schema/oaf/utils/PidType.java | 79 +++ .../schema/oaf/utils/PidValueComparator.java | 33 + .../schema/oaf/utils/ResultPidComparator.java | 53 ++ .../oaf/utils/ResultTypeComparator.java | 77 ++ .../dhp/sx/graph/scholix/ScholixUtils.scala | 17 +- .../schema/oaf/common/ModelSupportTest.java | 52 ++ .../oaf/utils/BlackListProviderTest.java | 21 + .../oaf/utils/IdentifierFactoryTest.java | 87 +++ .../schema/oaf/utils/OafMapperUtilsTest.java | 40 +- .../dhp/schema/oaf/utils/dataset_1.json | 29 +- 
.../dhp/schema/oaf/utils/dataset_2.json | 77 +- .../schema/oaf/utils/dataset_delegated.json | 77 +- .../dhp/schema/oaf/utils/orp-rohub.json | 197 ++++++ .../dhp/schema/oaf/utils/publication_1.json | 29 +- .../dhp/schema/oaf/utils/publication_2.json | 29 +- .../dhp/schema/oaf/utils/publication_3.json | 1 + .../dhp/schema/oaf/utils/publication_4.json | 1 + .../dhp/schema/oaf/utils/publication_5.json | 1 + .../schema/oaf/utils/publication_doi1.json | 33 + .../schema/oaf/utils/publication_doi2.json | 37 + .../schema/oaf/utils/publication_doi3.json | 37 + .../schema/oaf/utils/publication_doi4.json | 37 + .../schema/oaf/utils/publication_doi5.json | 37 + .../schema/oaf/utils/publication_openapc.json | 31 + .../schema/oaf/utils/publication_pmc1.json | 17 + .../schema/oaf/utils/publication_pmc2.json | 21 + .../schema/oaf/utils/publication_urn1.json | 23 + .../dhp/collection/CollectionUtils.scala | 7 +- .../dhp/datacite/DataciteModelConstants.scala | 12 +- .../DataciteToOAFTransformation.scala | 92 +-- .../ebi/SparkCreateBaselineDataFrame.scala | 3 +- .../dnetlib/dhp/sx/bio/BioScholixTest.scala | 32 +- .../dhp/oa/dedup/SparkCreateDedupRecord.java | 5 +- pom.xml | 2 +- 58 files changed, 3379 insertions(+), 424 deletions(-) create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/AccessRightComparator.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/EntityType.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/MainEntityType.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/ModelSupport.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/RefereedComparator.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/RelationInverse.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/CleaningFunctions.java create mode 100644 
dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeBeanUtils.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils2.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils3.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidBlacklist.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidBlacklistProvider.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidType.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java create mode 100644 dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/common/ModelSupportTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/BlackListProviderTest.java create mode 100644 dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/orp-rohub.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_3.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_4.json create mode 100644 
dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_5.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi1.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi2.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi3.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi4.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi5.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_openapc.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc1.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc2.json create mode 100644 dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn1.json diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java index 2ab23bda6..879a09481 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/Vocabulary.java @@ -73,11 +73,11 @@ public class Vocabulary implements Serializable { public Qualifier getTermAsQualifier(final String termId, boolean strict) { final VocabularyTerm term = getTerm(termId); if (Objects.nonNull(term)) { - return OafMapperUtils.qualifier(term.getId(), term.getName(), getId(), getName()); + return OafMapperUtils.qualifier(term.getId(), term.getName(), getId()); } else if (Objects.isNull(term) && strict) { - return OafMapperUtils.unknown(getId(), getName()); + return OafMapperUtils.unknown(getId()); } else { - return OafMapperUtils.qualifier(termId, termId, getId(), getName()); + return 
OafMapperUtils.qualifier(termId, termId, getId()); } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java index fc7175270..ccd2a7d1b 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/common/vocabulary/VocabularyGroup.java @@ -125,12 +125,12 @@ public class VocabularyGroup implements Serializable { if (vocabularyExists(vocId)) { return vocs.get(vocId.toLowerCase()).getTermAsQualifier(id); } - return OafMapperUtils.qualifier(id, id, "", ""); + return OafMapperUtils.qualifier(id, id, ""); } public Qualifier getSynonymAsQualifier(final String vocId, final String syn) { if (StringUtils.isBlank(vocId)) { - return OafMapperUtils.unknown("", ""); + return OafMapperUtils.unknown(""); } return vocs.get(vocId.toLowerCase()).getSynonymAsQualifier(syn); } @@ -142,7 +142,7 @@ public class VocabularyGroup implements Serializable { */ public Qualifier getSynonymAsQualifierCaseSensitive(final String vocId, final String syn) { if (StringUtils.isBlank(vocId)) { - return OafMapperUtils.unknown("", ""); + return OafMapperUtils.unknown(""); } return vocs.get(vocId).getSynonymAsQualifier(syn); } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java index aea046203..aa3c857cf 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/AuthorMerger.java @@ -10,6 +10,7 @@ import org.apache.commons.lang3.StringUtils; import com.wcohen.ss.JaroWinkler; import eu.dnetlib.dhp.schema.oaf.Author; +import eu.dnetlib.dhp.schema.oaf.AuthorPid; import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.pace.model.Person; import scala.Tuple2; @@ -75,7 +76,7 @@ public class AuthorMerger { 
.collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1)); // (list of pid that are missing in the other list) - final List> pidToEnrich = enrich + final List> pidToEnrich = enrich .stream() .filter(a -> a.getPid() != null && !a.getPid().isEmpty()) .flatMap( @@ -111,7 +112,7 @@ public class AuthorMerger { // TERRIBLE HACK but for some reason when we create and Array with Arrays.asList, // it creates of fixed size, and the add method raise UnsupportedOperationException at // java.util.AbstractList.add - final List tmp = new ArrayList<>(r.getPid()); + final List tmp = new ArrayList<>(r.getPid()); tmp.add(a._1()); r.setPid(tmp); } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/DispatchEntitiesSparkJob.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/DispatchEntitiesSparkJob.java index 3f65d754f..b74f895ff 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/DispatchEntitiesSparkJob.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/DispatchEntitiesSparkJob.java @@ -21,8 +21,8 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; +import eu.dnetlib.dhp.schema.oaf.Entity; import eu.dnetlib.dhp.schema.oaf.Oaf; -import eu.dnetlib.dhp.schema.oaf.OafEntity; public class DispatchEntitiesSparkJob { @@ -58,7 +58,7 @@ public class DispatchEntitiesSparkJob { log.info("graphTableClassName: {}", graphTableClassName); @SuppressWarnings("unchecked") - Class entityClazz = (Class) Class.forName(graphTableClassName); + Class entityClazz = (Class) Class.forName(graphTableClassName); SparkConf conf = new SparkConf(); runWithSparkSession( diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java index e652bd5b6..bb5e727de 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java +++ 
b/dhp-common/src/main/java/eu/dnetlib/dhp/oa/merge/GroupEntitiesSparkJob.java @@ -87,17 +87,17 @@ public class GroupEntitiesSparkJob { String inputPath, String outputPath) { - final TypedColumn aggregator = new GroupingAggregator().toColumn(); + final TypedColumn aggregator = new GroupingAggregator().toColumn(); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); spark .read() .textFile(toSeq(listEntityPaths(inputPath, sc))) - .map((MapFunction) GroupEntitiesSparkJob::parseOaf, Encoders.kryo(OafEntity.class)) - .filter((FilterFunction) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e))) - .groupByKey((MapFunction) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING()) + .map((MapFunction) GroupEntitiesSparkJob::parseOaf, Encoders.kryo(Entity.class)) + .filter((FilterFunction) e -> StringUtils.isNotBlank(ModelSupport.idFn().apply(e))) + .groupByKey((MapFunction) oaf -> ModelSupport.idFn().apply(oaf), Encoders.STRING()) .agg(aggregator) .map( - (MapFunction, String>) t -> t._2().getClass().getName() + + (MapFunction, String>) t -> t._2().getClass().getName() + "|" + OBJECT_MAPPER.writeValueAsString(t._2()), Encoders.STRING()) .write() @@ -106,19 +106,19 @@ public class GroupEntitiesSparkJob { .text(outputPath); } - public static class GroupingAggregator extends Aggregator { + public static class GroupingAggregator extends Aggregator { @Override - public OafEntity zero() { + public Entity zero() { return null; } @Override - public OafEntity reduce(OafEntity b, OafEntity a) { + public Entity reduce(Entity b, Entity a) { return mergeAndGet(b, a); } - private OafEntity mergeAndGet(OafEntity b, OafEntity a) { + private Entity mergeAndGet(Entity b, Entity a) { if (Objects.nonNull(a) && Objects.nonNull(b)) { return OafMapperUtils.mergeEntities(b, a); } @@ -126,28 +126,28 @@ public class GroupEntitiesSparkJob { } @Override - public OafEntity merge(OafEntity b, OafEntity a) { + public Entity merge(Entity b, Entity a) { return 
mergeAndGet(b, a); } @Override - public OafEntity finish(OafEntity j) { + public Entity finish(Entity j) { return j; } @Override - public Encoder bufferEncoder() { - return Encoders.kryo(OafEntity.class); + public Encoder bufferEncoder() { + return Encoders.kryo(Entity.class); } @Override - public Encoder outputEncoder() { - return Encoders.kryo(OafEntity.class); + public Encoder outputEncoder() { + return Encoders.kryo(Entity.class); } } - private static OafEntity parseOaf(String s) { + private static Entity parseOaf(String s) { DocumentContext dc = JsonPath .parse(s, Configuration.defaultConfiguration().addOptions(Option.SUPPRESS_EXCEPTIONS)); @@ -184,7 +184,7 @@ public class GroupEntitiesSparkJob { } } - private static OafEntity parse(String s, Class clazz) { + private static Entity parse(String s, Class clazz) { try { return OBJECT_MAPPER.readValue(s, clazz); } catch (IOException e) { diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/AccessRightComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/AccessRightComparator.java new file mode 100644 index 000000000..6efd1c3dd --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/AccessRightComparator.java @@ -0,0 +1,69 @@ + +package eu.dnetlib.dhp.schema.oaf.common; + +import eu.dnetlib.dhp.schema.oaf.Qualifier; + +import java.util.Comparator; + +public class AccessRightComparator implements Comparator { + + @Override + public int compare(T left, T right) { + + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; + + String lClass = left.getClassid(); + String rClass = right.getClassid(); + + if (lClass.equals(rClass)) + return 0; + + if (lClass.equals("OPEN SOURCE")) + return -1; + if (rClass.equals("OPEN SOURCE")) + return 1; + + if (lClass.equals("OPEN")) + return -1; + if (rClass.equals("OPEN")) + return 1; + + if (lClass.equals("6MONTHS")) + return -1; + if (rClass.equals("6MONTHS")) + 
return 1; + + if (lClass.equals("12MONTHS")) + return -1; + if (rClass.equals("12MONTHS")) + return 1; + + if (lClass.equals("EMBARGO")) + return -1; + if (rClass.equals("EMBARGO")) + return 1; + + if (lClass.equals("RESTRICTED")) + return -1; + if (rClass.equals("RESTRICTED")) + return 1; + + if (lClass.equals("CLOSED")) + return -1; + if (rClass.equals("CLOSED")) + return 1; + + if (lClass.equals("UNKNOWN")) + return -1; + if (rClass.equals("UNKNOWN")) + return 1; + + // Else (but unlikely), lexicographical ordering will do. + return lClass.compareTo(rClass); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/EntityType.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/EntityType.java new file mode 100644 index 000000000..81188fb11 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/EntityType.java @@ -0,0 +1,21 @@ + +package eu.dnetlib.dhp.schema.oaf.common; + +import eu.dnetlib.dhp.schema.oaf.Entity; + +/** Actual entity types in the Graph */ +public enum EntityType { + publication, dataset, otherresearchproduct, software, datasource, organization, project; + + /** + * Resolves the EntityType, given the relative class name + * + * @param clazz the given class name + * @param actual OafEntity subclass + * @return the EntityType associated to the given class + */ + public static EntityType fromClass(Class clazz) { + + return EntityType.valueOf(clazz.getSimpleName().toLowerCase()); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/MainEntityType.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/MainEntityType.java new file mode 100644 index 000000000..0ed0b65fd --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/MainEntityType.java @@ -0,0 +1,7 @@ + +package eu.dnetlib.dhp.schema.oaf.common; + +/** Main entity types in the Graph */ +public enum MainEntityType { + result, datasource, organization, project +} diff --git 
a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/ModelSupport.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/ModelSupport.java new file mode 100644 index 000000000..8a86a293d --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/ModelSupport.java @@ -0,0 +1,417 @@ + +package eu.dnetlib.dhp.schema.oaf.common; + +import com.github.sisyphsu.dateparser.DateParserUtils; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.*; +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.lang3.StringUtils; + +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.text.ParseException; +import java.util.Date; +import java.util.Map; +import java.util.Objects; +import java.util.Optional; +import java.util.function.Function; + +import static com.google.common.base.Preconditions.checkArgument; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; + +/** Oaf model utility methods. 
*/ +public class ModelSupport { + + /** Defines the mapping between the actual entity type and the main entity type */ + private static final Map entityMapping = Maps.newHashMap(); + + static { + entityMapping.put(EntityType.publication, MainEntityType.result); + entityMapping.put(EntityType.dataset, MainEntityType.result); + entityMapping.put(EntityType.otherresearchproduct, MainEntityType.result); + entityMapping.put(EntityType.software, MainEntityType.result); + entityMapping.put(EntityType.datasource, MainEntityType.datasource); + entityMapping.put(EntityType.organization, MainEntityType.organization); + entityMapping.put(EntityType.project, MainEntityType.project); + } + + /** + * Defines the mapping between the actual entity types and the relative classes implementing them + */ + public static final Map entityTypes = Maps.newHashMap(); + + static { + entityTypes.put(EntityType.datasource, Datasource.class); + entityTypes.put(EntityType.organization, Organization.class); + entityTypes.put(EntityType.project, Project.class); + entityTypes.put(EntityType.dataset, Dataset.class); + entityTypes.put(EntityType.otherresearchproduct, OtherResearchProduct.class); + entityTypes.put(EntityType.software, Software.class); + entityTypes.put(EntityType.publication, Publication.class); + } + + public static final Map oafTypes = Maps.newHashMap(); + + static { + oafTypes.put("datasource", Datasource.class); + oafTypes.put("organization", Organization.class); + oafTypes.put("project", Project.class); + oafTypes.put("dataset", Dataset.class); + oafTypes.put("otherresearchproduct", OtherResearchProduct.class); + oafTypes.put("software", Software.class); + oafTypes.put("publication", Publication.class); + oafTypes.put("relation", Relation.class); + } + + public static final Map idPrefixMap = Maps.newHashMap(); + + static { + idPrefixMap.put(Datasource.class, "10"); + idPrefixMap.put(Organization.class, "20"); + idPrefixMap.put(Project.class, "40"); + 
idPrefixMap.put(Dataset.class, "50"); + idPrefixMap.put(OtherResearchProduct.class, "50"); + idPrefixMap.put(Software.class, "50"); + idPrefixMap.put(Publication.class, "50"); + } + + public static final Map entityIdPrefix = Maps.newHashMap(); + + static { + entityIdPrefix.put("datasource", "10"); + entityIdPrefix.put("organization", "20"); + entityIdPrefix.put("project", "40"); + entityIdPrefix.put("result", "50"); + } + + public static final Map idPrefixEntity = Maps.newHashMap(); + + static { + idPrefixEntity.put("10", "datasource"); + idPrefixEntity.put("20", "organization"); + idPrefixEntity.put("40", "project"); + idPrefixEntity.put("50", "result"); + } + + public static final Map relationInverseMap = Maps.newHashMap(); + + static { + set(relationInverseMap, PROJECT_ORGANIZATION, PARTICIPATION, IS_PARTICIPANT, HAS_PARTICIPANT); + + set(relationInverseMap, RESULT_ORGANIZATION, AFFILIATION, IS_AUTHOR_INSTITUTION_OF, HAS_AUTHOR_INSTITUTION); + + set(relationInverseMap, ORG_ORG_RELTYPE, DEDUP, IS_MERGED_IN, MERGES); + set(relationInverseMap, ORG_ORG_RELTYPE, DEDUP, IS_SIMILAR_TO, IS_SIMILAR_TO); + + set(relationInverseMap, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, PRODUCES); + + set(relationInverseMap, DATASOURCE_ORGANIZATION, PROVISION, IS_PROVIDED_BY, PROVIDES); + + set(relationInverseMap, RESULT_RESULT, SIMILARITY, IS_AMONG_TOP_N_SIMILAR_DOCS, HAS_AMONG_TOP_N_SIMILAR_DOCS); + set(relationInverseMap, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, IS_SUPPLEMENTED_BY); + set(relationInverseMap, RESULT_RESULT, PART, IS_PART_OF, HAS_PART); + set(relationInverseMap, RESULT_RESULT, DEDUP, IS_MERGED_IN, MERGES); + set(relationInverseMap, RESULT_RESULT, DEDUP, IS_SIMILAR_TO, IS_SIMILAR_TO); + set(relationInverseMap, RESULT_RESULT, CITATION, IS_CITED_BY, CITES); + + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_IDENTICAL_TO, IS_IDENTICAL_TO); + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_REFERENCED_BY, REFERENCES); + set(relationInverseMap, 
RESULT_RESULT, RELATIONSHIP, IS_CONTINUED_BY, CONTINUES); + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DOCUMENTED_BY, DOCUMENTS); + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DERIVED_FROM, IS_SOURCE_OF); + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, IS_RELATED_TO); + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_COMPILED_BY, COMPILES); + + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_DESCRIBED_BY, DESCRIBES); + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_METADATA_FOR, IS_METADATA_OF); + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, HAS_ASSOCIATION_WITH, HAS_ASSOCIATION_WITH); + set(relationInverseMap, RESULT_RESULT, RELATIONSHIP, IS_REQUIRED_BY, REQUIRES); + + + set(relationInverseMap, RESULT_RESULT, VERSION, IS_PREVIOUS_VERSION_OF, IS_NEW_VERSION_OF); + set(relationInverseMap, RESULT_RESULT, VERSION, IS_VARIANT_FORM_OF, IS_ORIGINAL_FORM_OF); + set(relationInverseMap, RESULT_RESULT, VERSION, IS_OBSOLETED_BY, OBSOLETES); + set(relationInverseMap, RESULT_RESULT, VERSION, IS_VERSION_OF, HAS_VERSION); + + set(relationInverseMap, RESULT_RESULT, REVIEW, IS_REVIEWED_BY, REVIEWS); + } + + private static void set(Map relationInverseMap, String relType, String subRelType, String relClass, String inverseRelClass) { + relationInverseMap + .put( + rel(relType, subRelType, relClass), new RelationInverse() + .setInverseRelClass(inverseRelClass) + .setRelClass(relClass) + .setRelType(relType) + .setSubReltype(subRelType)); + if (!relClass.equals(inverseRelClass)) { + relationInverseMap + .put( + rel(relType, subRelType, inverseRelClass), new RelationInverse() + .setInverseRelClass(relClass) + .setRelClass(inverseRelClass) + .setRelType(relType) + .setSubReltype(subRelType)); + } + } + + /** + * Helper method: lookup relation inverse, given the direct relation encoding (case insensitive) + * @param encoding + * @return the relation inverse descriptor, throws @IllegalArgumentException when not 
found. + */ + public static RelationInverse findInverse(String encoding) { + return ModelSupport.relationInverseMap + .entrySet() + .stream() + .filter(r -> encoding.equalsIgnoreCase(r.getKey())) + .findFirst() + .map(r -> r.getValue()) + .orElseThrow(() -> new IllegalArgumentException("invalid relationship: " + encoding)); + } + + /** + * Helper method: fina a relation filtering by a relation name + * @param relationName + * @return + */ + public static RelationInverse findRelation(final String relationName) { + return relationInverseMap.values() + .stream() + .filter(r -> relationName.equalsIgnoreCase(r.getRelClass())) + .findFirst() + .orElse(null); + } + + /** + * Helper method: combines the relation attributes + * @param relType + * @param subRelType + * @param relClass + * @return + */ + public static String rel(String relType, String subRelType, String relClass) { + return String.format("%s_%s_%s", relType, subRelType, relClass); + } + + private static final String schemeTemplate = "dnet:%s_%s_relations"; + + public static final String DATE_FORMAT = "yyyy-MM-dd"; + + private ModelSupport() { + } + + public static String getIdPrefix(Class clazz) { + return idPrefixMap.get(clazz); + } + + /** + * Checks subclass-superclass relationship. + * + * @param subClazzObject Subclass object instance + * @param superClazzObject Superclass object instance + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass( + X subClazzObject, Y superClazzObject) { + return isSubClass(subClazzObject.getClass(), superClazzObject.getClass()); + } + + /** + * Checks subclass-superclass relationship. 
+ * + * @param subClazzObject Subclass object instance + * @param superClazz Superclass class + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass( + X subClazzObject, Class superClazz) { + return isSubClass(subClazzObject.getClass(), superClazz); + } + + /** + * Checks subclass-superclass relationship. + * + * @param subClazz Subclass class + * @param superClazz Superclass class + * @param Subclass type + * @param Superclass type + * @return True if X is a subclass of Y + */ + public static Boolean isSubClass( + Class subClazz, Class superClazz) { + return superClazz.isAssignableFrom(subClazz); + } + + /** + * Lists all the OAF model classes + * + * @param + * @return + */ + public static Class[] getOafModelClasses() { + return new Class[] { + AccessRight.class, + Author.class, + AuthorPid.class, + Context.class, + Country.class, + DataInfo.class, + Dataset.class, + Datasource.class, + Entity.class, + EntityDataInfo.class, + EoscIfGuidelines.class, + ExternalReference.class, + ExtraInfo.class, + GeoLocation.class, + H2020Classification.class, + H2020Programme.class, + Instance.class, + Journal.class, + KeyValue.class, + License.class, + Measure.class, + OAIProvenance.class, + OpenAccessRoute.class, + Organization.class, + OriginDescription.class, + OtherResearchProduct.class, + Project.class, + Provenance.class, + Publication.class, + Publisher.class, + Qualifier.class, + Relation.class, + Result.class, + Software.class, + StructuredProperty.class, + Subject.class + }; + } + + public static String getMainType(final EntityType type) { + return entityMapping.get(type).name(); + } + + public static boolean isResult(EntityType type) { + return MainEntityType.result.name().equals(getMainType(type)); + } + + public static String getScheme(final String sourceType, final String targetType) { + return String + .format( + schemeTemplate, + 
entityMapping.get(EntityType.valueOf(sourceType)).name(), + entityMapping.get(EntityType.valueOf(targetType)).name()); + } + + public static String tableIdentifier(String dbName, String tableName) { + + checkArgument(StringUtils.isNotBlank(dbName), "DB name cannot be empty"); + checkArgument(StringUtils.isNotBlank(tableName), "table name cannot be empty"); + + return String.format("%s.%s", dbName, tableName); + } + + public static String tableIdentifier(String dbName, Class clazz) { + + checkArgument(Objects.nonNull(clazz), "clazz is needed to derive the table name, thus cannot be null"); + + return tableIdentifier(dbName, clazz.getSimpleName().toLowerCase()); + } + + public static Function idFn() { + return x -> { + if (isSubClass(x, Relation.class)) { + return idFnForRelation(x); + } + return idFnForOafEntity(x); + }; + } + + private static String idFnForRelation(T t) { + Relation r = (Relation) t; + return Optional + .ofNullable(r.getSource()) + .map( + source -> Optional + .ofNullable(r.getTarget()) + .map( + target -> Optional + .ofNullable(r.getRelType()) + .map( + relType -> Optional + .ofNullable(r.getSubRelType()) + .map( + subRelType -> Optional + .ofNullable(r.getRelClass()) + .map( + relClass -> String + .join( + source, + target, + relType, + subRelType, + relClass)) + .orElse( + String + .join( + source, + target, + relType, + subRelType))) + .orElse(String.join(source, target, relType))) + .orElse(String.join(source, target))) + .orElse(source)) + .orElse(null); + } + + private static String idFnForOafEntity(T t) { + return ((Entity) t).getId(); + } + + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes(StandardCharsets.UTF_8)); + return new String(Hex.encodeHex(md.digest())); + } catch (final NoSuchAlgorithmException e) { + throw new IllegalStateException(e); + } + } + + public static String generateIdentifier(final String originalId, final String nsPrefix) { + 
return String.format("%s::%s", nsPrefix, md5(originalId)); + } + + public static String oldest(String dateA, String dateB) throws ParseException { + + if (StringUtils.isBlank(dateA)) { + return dateB; + } + if (StringUtils.isBlank(dateB)) { + return dateA; + } + if (StringUtils.isNotBlank(dateA) && StringUtils.isNotBlank(dateB)) { + + final Date a = DateParserUtils.parseDate(dateA); + final Date b = DateParserUtils.parseDate(dateB); + + if (Objects.nonNull(a) && Objects.nonNull(b)) { + return a.before(b) ? dateA : dateB; + } else { + return null; + } + } else { + return null; + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/RefereedComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/RefereedComparator.java new file mode 100644 index 000000000..a1d712385 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/RefereedComparator.java @@ -0,0 +1,45 @@ + +package eu.dnetlib.dhp.schema.oaf.common; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.Qualifier; + +import java.util.Comparator; + +public class RefereedComparator implements Comparator { + + @Override + public int compare(Qualifier left, Qualifier right) { + + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; + + String lClass = left.getClassid(); + String rClass = right.getClassid(); + + if (lClass.equals(rClass)) + return 0; + + if (lClass.equals(ModelConstants.PEER_REVIEWED_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.PEER_REVIEWED_CLASSID)) + return 1; + + if (lClass.equals(ModelConstants.NON_PEER_REVIEWED_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.NON_PEER_REVIEWED_CLASSID)) + return 1; + + if (lClass.equals(ModelConstants.UNKNOWN)) + return -1; + if (rClass.equals(ModelConstants.UNKNOWN)) + return 1; + + // Else (but unlikely), lexicographical ordering will do. 
+ return lClass.compareTo(rClass); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/RelationInverse.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/RelationInverse.java new file mode 100644 index 000000000..27a5c3411 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/common/RelationInverse.java @@ -0,0 +1,46 @@ + +package eu.dnetlib.dhp.schema.oaf.common; + +public class RelationInverse { + private String relClass; + private String inverseRelClass; + private String relType; + private String subReltype; + + public String getRelType() { + return relType; + } + + public RelationInverse setRelType(String relType) { + this.relType = relType; + return this; + } + + public String getSubReltype() { + return subReltype; + } + + public RelationInverse setSubReltype(String subReltype) { + this.subReltype = subReltype; + return this; + } + + public String getRelClass() { + return relClass; + } + + public RelationInverse setRelClass(String relClass) { + this.relClass = relClass; + return this; + } + + public String getInverseRelClass() { + return inverseRelClass; + } + + public RelationInverse setInverseRelClass(String inverseRelClass) { + this.inverseRelClass = inverseRelClass; + return this; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/CleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/CleaningFunctions.java new file mode 100644 index 000000000..c0ef339bd --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/CleaningFunctions.java @@ -0,0 +1,76 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.HashSet; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; + +import org.apache.commons.lang3.StringUtils; + +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class CleaningFunctions { + + public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)"; + public static final 
String DOI_PREFIX = "10."; + + public static final Set PID_BLACKLIST = new HashSet<>(); + + static { + PID_BLACKLIST.add("none"); + PID_BLACKLIST.add("na"); + } + + public CleaningFunctions() { + } + + /** + * Utility method that filter PID values on a per-type basis. + * @param s the PID whose value will be checked. + * @return false if the pid matches the filter criteria, true otherwise. + */ + public static boolean pidFilter(StructuredProperty s) { + final String pidValue = s.getValue(); + if (Objects.isNull(s.getQualifier()) || + StringUtils.isBlank(pidValue) || + StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) { + return false; + } + if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) { + return false; + } + return !PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue); + } + + /** + * Utility method that normalises PID values on a per-type basis. + * @param pid the PID whose value will be normalised. + * @return the PID containing the normalised value. 
+ */ + public static StructuredProperty normalizePidValue(StructuredProperty pid) { + pid + .setValue( + normalizePidValue( + pid.getQualifier().getClassid(), + pid.getValue())); + + return pid; + } + + public static String normalizePidValue(String pidType, String pidValue) { + String value = Optional + .ofNullable(pidValue) + .map(String::trim) + .orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty")); + + switch (pidType) { + + // TODO add cleaning for more PID types as needed + case "doi": + return value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX); + } + return value; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java index b24daaa5d..d9e1e20b5 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/GraphCleaningFunctions.java @@ -13,6 +13,8 @@ import java.util.stream.Collectors; import java.util.stream.Stream; import org.apache.commons.lang3.StringUtils; +import org.apache.spark.api.java.function.MapFunction; +import org.apache.spark.sql.Encoders; import com.github.sisyphsu.dateparser.DateParserUtils; import com.google.common.collect.Lists; @@ -23,8 +25,6 @@ import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; import me.xuender.unidecode.Unidecode; -import org.apache.spark.api.java.function.MapFunction; -import org.apache.spark.sql.Encoders; public class GraphCleaningFunctions extends CleaningFunctions { @@ -91,48 +91,31 @@ public class GraphCleaningFunctions extends CleaningFunctions { } public static boolean filter(T value) { - if (Boolean.TRUE - .equals( - Optional - .ofNullable(value) - .map( - o -> Optional - .ofNullable(o.getDataInfo()) - .map( - d -> Optional - .ofNullable(d.getInvisible()) - 
.orElse(true)) - .orElse(true)) - .orElse(true))) { - return true; - } - - if (value instanceof Datasource) { - // nothing to evaluate here - } else if (value instanceof Project) { - // nothing to evaluate here - } else if (value instanceof Organization) { - // nothing to evaluate here - } else if (value instanceof Relation) { - // nothing to clean here - } else if (value instanceof Result) { - - Result r = (Result) value; - - if (Objects.isNull(r.getTitle()) || r.getTitle().isEmpty()) { - return false; - } - - if (value instanceof Publication) { - - } else if (value instanceof Dataset) { - - } else if (value instanceof OtherResearchProduct) { - - } else if (value instanceof Software) { + if (value instanceof Entity) { + Entity entity = (Entity) value; + if (Boolean.TRUE + .equals( + Optional + .ofNullable(entity) + .map( + o -> Optional + .ofNullable(o.getDataInfo()) + .map( + d -> Optional + .ofNullable(d.getInvisible()) + .orElse(true)) + .orElse(true)) + .orElse(true))) { + return true; + } else if (value instanceof Result) { + Result r = (Result) value; + if (Objects.isNull(r.getTitle()) || r.getTitle().isEmpty()) { + return false; + } } } + return true; } @@ -164,7 +147,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { if (Objects.nonNull(r.getDateofacceptance())) { Optional date = cleanDateField(r.getDateofacceptance()); if (date.isPresent()) { - r.getDateofacceptance().setValue(date.get()); + r.setDateofacceptance(date.get()); } else { r.setDateofacceptance(null); } @@ -185,7 +168,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .filter(sp -> StringUtils.isNotBlank(sp.getValue())) .collect(Collectors.toList())); } - if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) { + if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getName())) { r.setPublisher(null); } if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) { @@ 
-267,7 +250,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { .getDescription() .stream() .filter(Objects::nonNull) - .filter(sp -> StringUtils.isNotBlank(sp.getValue())) + .filter(s -> StringUtils.isNotBlank(s)) .map(GraphCleaningFunctions::cleanValue) .collect(Collectors.toList())); } @@ -288,29 +271,25 @@ public class GraphCleaningFunctions extends CleaningFunctions { .setInstancetype( OafMapperUtils .qualifier( - "0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE, - ModelConstants.DNET_PUBLICATION_RESOURCE)); + "0038", "Other literature type", ModelConstants.DNET_PUBLICATION_RESOURCE)); } else if (r instanceof Dataset) { i .setInstancetype( OafMapperUtils .qualifier( - "0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE, - ModelConstants.DNET_PUBLICATION_RESOURCE)); + "0039", "Other dataset type", ModelConstants.DNET_PUBLICATION_RESOURCE)); } else if (r instanceof Software) { i .setInstancetype( OafMapperUtils .qualifier( - "0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE, - ModelConstants.DNET_PUBLICATION_RESOURCE)); + "0040", "Other software type", ModelConstants.DNET_PUBLICATION_RESOURCE)); } else if (r instanceof OtherResearchProduct) { i .setInstancetype( OafMapperUtils .qualifier( - "0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE, - ModelConstants.DNET_PUBLICATION_RESOURCE)); + "0020", "Other ORP type", ModelConstants.DNET_PUBLICATION_RESOURCE)); } } @@ -348,7 +327,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { if (Objects.nonNull(i.getDateofacceptance())) { Optional date = cleanDateField(i.getDateofacceptance()); if (date.isPresent()) { - i.getDateofacceptance().setValue(date.get()); + i.setDateofacceptance(date.get()); } else { i.setDateofacceptance(null); } @@ -456,10 +435,9 @@ public class GraphCleaningFunctions extends CleaningFunctions { return value; } - private static Optional cleanDateField(Field dateofacceptance) { + 
private static Optional cleanDateField(String dateofacceptance) { return Optional .ofNullable(dateofacceptance) - .map(Field::getValue) .map(GraphCleaningFunctions::cleanDate) .filter(Objects::nonNull); } @@ -513,7 +491,6 @@ public class GraphCleaningFunctions extends CleaningFunctions { private static void fixVocabName(Qualifier q, String vocabularyName) { if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) { q.setSchemeid(vocabularyName); - q.setSchemename(vocabularyName); } } @@ -524,9 +501,7 @@ public class GraphCleaningFunctions extends CleaningFunctions { } private static Qualifier qualifier(String classid, String classname, String scheme) { - return OafMapperUtils - .qualifier( - classid, classname, scheme, scheme); + return OafMapperUtils.qualifier(classid, classname, scheme); } protected static StructuredProperty cleanValue(StructuredProperty s) { @@ -539,9 +514,8 @@ public class GraphCleaningFunctions extends CleaningFunctions { return s; } - protected static Field cleanValue(Field s) { - s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " ")); - return s; + protected static String cleanValue(String s) { + return s.replaceAll(CLEANING_REGEX, " "); } } diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java new file mode 100644 index 000000000..cba65b02a --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactory.java @@ -0,0 +1,294 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static com.google.common.base.Preconditions.checkArgument; +import static eu.dnetlib.dhp.schema.common.ModelConstants.*; + +import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.security.MessageDigest; +import java.util.*; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import 
eu.dnetlib.dhp.schema.oaf.common.ModelSupport; +import org.apache.commons.codec.binary.Hex; +import org.apache.commons.lang3.StringUtils; + +import com.google.common.collect.HashBiMap; +import com.google.common.collect.Maps; + +import eu.dnetlib.dhp.schema.oaf.*; + +/** + * Factory class for OpenAIRE identifiers in the Graph + */ +public class IdentifierFactory implements Serializable { + + public static final String ID_SEPARATOR = "::"; + public static final String ID_PREFIX_SEPARATOR = "|"; + + public static final int ID_PREFIX_LEN = 12; + + /** + * Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE. + * The id of the record (source_::id) will be rewritten as pidType_::id) + */ + public static final Map> PID_AUTHORITY = Maps.newHashMap(); + + static { + PID_AUTHORITY.put(PidType.doi, HashBiMap.create()); + PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref"); + PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite"); + PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "ZENODO"); + PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "Zenodo"); + + PID_AUTHORITY.put(PidType.pmc, HashBiMap.create()); + PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central"); + PID_AUTHORITY.get(PidType.pmc).put(PUBMED_CENTRAL_ID, "PubMed Central"); + + PID_AUTHORITY.put(PidType.pmid, HashBiMap.create()); + PID_AUTHORITY.get(PidType.pmid).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central"); + PID_AUTHORITY.get(PidType.pmid).put(PUBMED_CENTRAL_ID, "PubMed Central"); + + PID_AUTHORITY.put(PidType.arXiv, HashBiMap.create()); + PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive"); + + PID_AUTHORITY.put(PidType.w3id, HashBiMap.create()); + PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ROHub"); + } + + /** + * Declares the associations PID_TYPE -> [DATASOURCE ID, PID SUBSTRING] considered as delegated authority for that + * PID_TYPE. 
Example, Zenodo is delegated to forge DOIs that contain the 'zenodo' word. + * + * If a record with the same id (same pid) comes from 2 data sources, the one coming from a delegated source wins. E.g. Zenodo records win over those from Datacite. + * See also https://code-repo.d4science.org/D-Net/dnet-hadoop/pulls/187 and the class dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java + */ + public static final Map> DELEGATED_PID_AUTHORITY = Maps.newHashMap(); + + static { + DELEGATED_PID_AUTHORITY.put(PidType.doi, new HashMap<>()); + DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_OD_ID, "zenodo"); + DELEGATED_PID_AUTHORITY.get(PidType.doi).put(ZENODO_R3_ID, "zenodo"); + DELEGATED_PID_AUTHORITY.put(PidType.w3id, new HashMap<>()); + DELEGATED_PID_AUTHORITY.get(PidType.w3id).put(ROHUB_ID, "ro-id"); + } + + /** + * Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] whose records are considered enrichment for the graph. + * Their OpenAIRE ID is built from the declared PID type. Are merged with their corresponding record, identified by + * the same OpenAIRE id. 
+ */ + public static final Map> ENRICHMENT_PROVIDER = Maps.newHashMap(); + + static { + ENRICHMENT_PROVIDER.put(PidType.doi, HashBiMap.create()); + ENRICHMENT_PROVIDER.get(PidType.doi).put(OPEN_APC_ID, OPEN_APC_NAME); + } + + public static Set delegatedAuthorityDatasourceIds() { + return DELEGATED_PID_AUTHORITY + .values() + .stream() + .flatMap(m -> m.keySet().stream()) + .collect(Collectors.toCollection(HashSet::new)); + } + + public static List getPids(List pid, KeyValue collectedFrom) { + return pidFromInstance(pid, collectedFrom, true).distinct().collect(Collectors.toList()); + } + + public static String createDOIBoostIdentifier(T entity) { + if (entity == null) + return null; + + StructuredProperty pid = null; + if (entity.getPid() != null) { + pid = entity + .getPid() + .stream() + .filter(Objects::nonNull) + .filter(s -> s.getQualifier() != null && "doi".equalsIgnoreCase(s.getQualifier().getClassid())) + .filter(CleaningFunctions::pidFilter) + .findAny() + .orElse(null); + } else { + if (entity.getInstance() != null) { + pid = entity + .getInstance() + .stream() + .filter(i -> i.getPid() != null) + .flatMap(i -> i.getPid().stream()) + .filter(CleaningFunctions::pidFilter) + .findAny() + .orElse(null); + } + } + if (pid != null) + return idFromPid(entity, pid, true); + return null; + } + + /** + * Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given + * entity T. Returns entity.id when none of the PIDs meet the selection criteria is available. + * + * @param entity the entity providing PIDs and a default ID. + * @param the specific entity type. Currently Organization and Result subclasses are supported. + * @param md5 indicates whether should hash the PID value or not. 
+ * @return an identifier from the most relevant PID, entity.id otherwise + */ + public static String createIdentifier(T entity, boolean md5) { + + checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier"); + + final Map> pids = extractPids(entity); + + return pids + .values() + .stream() + .flatMap(Set::stream) + .min(new PidComparator<>(entity)) + .map( + min -> Optional + .ofNullable(pids.get(min.getQualifier().getClassid())) + .map( + p -> p + .stream() + .sorted(new PidValueComparator()) + .findFirst() + .map(s -> idFromPid(entity, s, md5)) + .orElseGet(entity::getId)) + .orElseGet(entity::getId)) + .orElseGet(entity::getId); + } + + private static Map> extractPids(T entity) { + if (entity instanceof Result) { + return Optional + .ofNullable(((Result) entity).getInstance()) + .map(IdentifierFactory::mapPids) + .orElse(new HashMap<>()); + } else { + return entity + .getPid() + .stream() + .map(CleaningFunctions::normalizePidValue) + .filter(CleaningFunctions::pidFilter) + .collect( + Collectors + .groupingBy( + p -> p.getQualifier().getClassid(), + Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new)))); + } + } + + private static Map> mapPids(List instance) { + return instance + .stream() + .map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false)) + .flatMap(Function.identity()) + .collect( + Collectors + .groupingBy( + p -> p.getQualifier().getClassid(), + Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new)))); + } + + private static Stream pidFromInstance(List pid, KeyValue collectedFrom, + boolean mapHandles) { + return Optional + .ofNullable(pid) + .map( + pp -> pp + .stream() + // filter away PIDs provided by a DS that is not considered an authority for the + // given PID Type + .filter(p -> shouldFilterPidByCriteria(collectedFrom, p, mapHandles)) + .map(CleaningFunctions::normalizePidValue) + .filter(p -> isNotFromDelegatedAuthority(collectedFrom, p)) + .filter(CleaningFunctions::pidFilter)) + 
.orElse(Stream.empty()); + } + + private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) { + final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); + + if (Objects.isNull(collectedFrom)) { + return false; + } + + boolean isEnrich = Optional + .ofNullable(ENRICHMENT_PROVIDER.get(pType)) + .map( + enrich -> enrich.containsKey(collectedFrom.getKey()) + || enrich.containsValue(collectedFrom.getValue())) + .orElse(false); + + boolean isAuthority = Optional + .ofNullable(PID_AUTHORITY.get(pType)) + .map( + authorities -> authorities.containsKey(collectedFrom.getKey()) + || authorities.containsValue(collectedFrom.getValue())) + .orElse(false); + + return (mapHandles && pType.equals(PidType.handle)) || isEnrich || isAuthority; + } + + private static boolean isNotFromDelegatedAuthority(KeyValue collectedFrom, StructuredProperty p) { + final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); + + final Map da = DELEGATED_PID_AUTHORITY.get(pType); + if (Objects.isNull(da)) { + return true; + } + if (!da.containsKey(collectedFrom.getKey())) { + return true; + } + return StringUtils.contains(p.getValue(), da.get(collectedFrom.getKey())); + } + + /** + * @see {@link IdentifierFactory#createIdentifier(Entity, boolean)} + */ + public static String createIdentifier(T entity) { + + return createIdentifier(entity, true); + } + + private static String idFromPid(T entity, StructuredProperty s, boolean md5) { + return idFromPid(ModelSupport.getIdPrefix(entity.getClass()), s.getQualifier().getClassid(), s.getValue(), md5); + } + + public static String idFromPid(String numericPrefix, String pidType, String pidValue, boolean md5) { + return new StringBuilder() + .append(numericPrefix) + .append(ID_PREFIX_SEPARATOR) + .append(createPrefix(pidType)) + .append(ID_SEPARATOR) + .append(md5 ? 
md5(pidValue) : pidValue) + .toString(); + } + + // create the prefix (length = 12) + private static String createPrefix(String pidType) { + StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN)); + while (prefix.length() < ID_PREFIX_LEN) { + prefix.append("_"); + } + return prefix.substring(0, ID_PREFIX_LEN); + } + + public static String md5(final String s) { + try { + final MessageDigest md = MessageDigest.getInstance("MD5"); + md.update(s.getBytes(StandardCharsets.UTF_8)); + return new String(Hex.encodeHex(md.digest())); + } catch (final Exception e) { + return null; + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeBeanUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeBeanUtils.java new file mode 100644 index 000000000..a318f991c --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeBeanUtils.java @@ -0,0 +1,104 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.lang.reflect.InvocationTargetException; +import java.util.ArrayList; +import java.util.List; + +import org.apache.commons.beanutils.BeanUtilsBean; + +public class MergeBeanUtils { + + /** + * Copies all properties from sources to destination, does not copy null values and any nested objects will attempted to be + * either cloned or copied into the existing object. This is recursive. Should not cause any infinite recursion. + * @param dest object to copy props into (will mutate) + * @param sources + * @param dest + * @return + * @throws IllegalAccessException + * @throws InvocationTargetException + */ + public static T mergeIn(T dest, T... 
sources) { + // to keep from any chance infinite recursion lets limit each object to 1 instance at a time in the stack + final List lookingAt = new ArrayList<>(); + + BeanUtilsBean recursiveBeanUtils = new BeanUtilsBean() { + + /** + * Check if the class name is an internal one + * @param name + * @return + */ + private boolean isInternal(String name) { + return name.startsWith("java.") || name.startsWith("javax.") + || name.startsWith("com.sun.") || name.startsWith("javax.") + || name.startsWith("oracle."); + } + + /** + * Override to ensure that we dont end up in infinite recursion + * @param dest + * @param orig + * @throws IllegalAccessException + * @throws InvocationTargetException + */ + @Override + public void copyProperties(Object dest, Object orig) + throws IllegalAccessException, InvocationTargetException { + try { + // if we have an object in our list, that means we hit some sort of recursion, stop here. + if (lookingAt.stream().anyMatch(o -> o == dest)) { + return; // recursion detected + } + lookingAt.add(dest); + super.copyProperties(dest, orig); + } finally { + lookingAt.remove(dest); + } + } + + @Override + public void copyProperty(Object dest, String name, Object value) + throws IllegalAccessException, InvocationTargetException { + + if ("resulttype".equals(name)) { + return; + } else if (value != null) { + // dont copy over null values + // attempt to check if the value is a pojo we can clone using nested calls + if (!value.getClass().isPrimitive() && !value.getClass().isSynthetic() + && !isInternal(value.getClass().getName())) { + try { + Object prop = super.getPropertyUtils().getProperty(dest, name); + // get current value, if its null then clone the value and set that to the value + if (prop == null) { + super.setProperty(dest, name, super.cloneBean(value)); + } else { + // get the destination value and then recursively call + copyProperties(prop, value); + } + } catch (NoSuchMethodException e) { + return; + } catch (InstantiationException e) { 
+ throw new RuntimeException("Nested property could not be cloned.", e); + } + } else { + super.copyProperty(dest, name, value); + } + } + } + }; + + for (Object source : sources) { + try { + recursiveBeanUtils.copyProperties(dest, source); + } catch (IllegalAccessException | InvocationTargetException e) { + throw new RuntimeException(e); + } + } + + return dest; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java new file mode 100644 index 000000000..eb4765093 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils.java @@ -0,0 +1,661 @@ +package eu.dnetlib.dhp.schema.oaf.utils; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.*; + +import eu.dnetlib.dhp.schema.oaf.common.AccessRightComparator; +import eu.dnetlib.dhp.schema.oaf.common.ModelSupport; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; + +import java.text.ParseException; +import java.util.*; +import java.util.stream.Collectors; + +import static com.google.common.base.Preconditions.checkArgument; + +public class MergeUtils { + + public static Result mergeResult(Result original, Result enrich) { + + final Result mergedResult = (Result) mergeEntity(original, enrich); + + if(StringUtils.isBlank(mergedResult.getProcessingchargeamount())){ + mergedResult.setProcessingchargeamount(enrich.getProcessingchargeamount()); + mergedResult.setProcessingchargecurrency(enrich.getProcessingchargecurrency()); + } + + mergedResult.setMeasures(mergeLists(mergedResult.getMeasures(), enrich.getMeasures())); + + if( !isAnEnrichment(mergedResult) && !isAnEnrichment(enrich)) + mergedResult.setInstance(mergeLists(mergedResult.getInstance(), enrich.getInstance())); + else { + final List enrichmentInstances = isAnEnrichment(mergedResult) ? 
mergedResult.getInstance() : enrich.getInstance(); + final List enrichedInstances= isAnEnrichment(mergedResult) ? enrich.getInstance(): mergedResult.getInstance(); + if (isAnEnrichment(mergedResult)) + mergedResult.setDataInfo(enrich.getDataInfo()); + mergedResult.setInstance(enrichInstances(enrichedInstances,enrichmentInstances)); + } + + if (enrich.getBestaccessright() != null + && new AccessRightComparator<>().compare(enrich.getBestaccessright(), mergedResult.getBestaccessright()) < 0) + mergedResult.setBestaccessright(enrich.getBestaccessright()); + + final int trustCompareResult = compareTrust(mergedResult, enrich); + + if (enrich.getResulttype() != null && trustCompareResult < 0) + mergedResult.setResulttype(enrich.getResulttype()); + + if (enrich.getLanguage() != null && trustCompareResult < 0) + mergedResult.setLanguage(enrich.getLanguage()); + + if (Objects.nonNull(enrich.getDateofacceptance())) { + if (Objects.isNull(mergedResult.getDateofacceptance())) { + mergedResult.setDateofacceptance(enrich.getDateofacceptance()); + } else if (trustCompareResult < 0) { + mergedResult.setDateofacceptance(enrich.getDateofacceptance()); + } + } + + mergedResult.setCountry(mergeLists(mergedResult.getCountry(), enrich.getCountry())); + + mergedResult.setSubject(mergeLists(mergedResult.getSubject(), enrich.getSubject())); + + if (enrich.getJournal() != null && trustCompareResult < 0) + mergedResult.setJournal(enrich.getJournal()); + + // merge title lists: main title with higher trust and distinct between the others + StructuredProperty baseMainTitle = null; + if (mergedResult.getTitle() != null) { + baseMainTitle = getMainTitle(mergedResult.getTitle()); + if (baseMainTitle != null) { + final StructuredProperty p = baseMainTitle; + mergedResult.setTitle(mergedResult.getTitle().stream().filter(t -> t != p).collect(Collectors.toList())); + } + } + + StructuredProperty newMainTitle = null; + if (enrich.getTitle() != null) { + newMainTitle = getMainTitle(enrich.getTitle()); + 
if (newMainTitle != null) { + final StructuredProperty p = newMainTitle; + enrich.setTitle(enrich.getTitle().stream().filter(t -> t != p).collect(Collectors.toList())); + } + } + + if (newMainTitle != null && trustCompareResult < 0) { + baseMainTitle = newMainTitle; + } + + mergedResult.setTitle(mergeLists(mergedResult.getTitle(), enrich.getTitle())); + if (mergedResult.getTitle() != null && baseMainTitle != null) { + mergedResult.getTitle().add(baseMainTitle); + } + + mergedResult.setRelevantdate(mergeLists(mergedResult.getRelevantdate(), enrich.getRelevantdate())); + + mergedResult.setDescription(longestLists(mergedResult.getDescription(), enrich.getDescription())); + + if (enrich.getPublisher() != null && trustCompareResult < 0) + mergedResult.setPublisher(enrich.getPublisher()); + + if (enrich.getEmbargoenddate() != null && trustCompareResult < 0) + mergedResult.setEmbargoenddate(enrich.getEmbargoenddate()); + + mergedResult.setSource(mergeLists(mergedResult.getSource(), enrich.getSource())); + + mergedResult.setFulltext(mergeLists(mergedResult.getFulltext(), enrich.getFulltext())); + + mergedResult.setFormat(mergeLists(mergedResult.getFormat(), enrich.getFormat())); + + mergedResult.setContributor(mergeLists(mergedResult.getContributor(), enrich.getContributor())); + + if (enrich.getResourcetype() != null) + mergedResult.setResourcetype(enrich.getResourcetype()); + + mergedResult.setCoverage(mergeLists(mergedResult.getCoverage(), enrich.getCoverage())); + + mergedResult.setContext(mergeLists(mergedResult.getContext(), enrich.getContext())); + + mergedResult.setExternalReference(mergeLists(mergedResult.getExternalReference(), enrich.getExternalReference())); + + if (enrich.getOaiprovenance() != null && trustCompareResult < 0) + mergedResult.setOaiprovenance(enrich.getOaiprovenance()); + + return mergedResult; + } + + public static OtherResearchProduct mergeORP(OtherResearchProduct original, OtherResearchProduct enrich) { + final OtherResearchProduct mergedORP = 
(OtherResearchProduct) mergeResult(original, enrich); + + mergedORP.setContactperson(mergeLists(mergedORP.getContactperson(), enrich.getContactperson())); + mergedORP.setContactgroup(mergeLists(mergedORP.getContactgroup(), enrich.getContactgroup())); + mergedORP.setTool(mergeLists(mergedORP.getTool(), enrich.getTool())); + mergeEntityDataInfo(mergedORP, enrich); + + return mergedORP; + } + + public static Software mergeSoftware(Software original, Software enrich) { + final Software mergedSoftware = (Software) mergeResult(original, enrich); + + mergedSoftware.setDocumentationUrl(mergeLists(mergedSoftware.getDocumentationUrl(), enrich.getDocumentationUrl())); + + mergedSoftware.setCodeRepositoryUrl(enrich.getCodeRepositoryUrl() != null && compareTrust(mergedSoftware,enrich) < 0 + ? enrich.getCodeRepositoryUrl() + : mergedSoftware.getCodeRepositoryUrl()); + + mergedSoftware.setProgrammingLanguage(enrich.getProgrammingLanguage() != null && compareTrust(mergedSoftware, enrich) < 0 + ? enrich.getProgrammingLanguage() + : mergedSoftware.getProgrammingLanguage()); + + mergeEntityDataInfo(mergedSoftware, enrich); + return mergedSoftware; + } + + public static Dataset mergeDataset(Dataset original, Dataset enrich) { + + final Dataset mergedDataset = (Dataset) mergeResult(original, enrich); + + mergedDataset.setStoragedate(enrich.getStoragedate() != null && compareTrust(mergedDataset, enrich) < 0 ? enrich.getStoragedate() : mergedDataset.getStoragedate()); + + mergedDataset.setDevice(enrich.getDevice() != null && compareTrust(mergedDataset, enrich) < 0 ? enrich.getDevice() : mergedDataset.getDevice()); + + mergedDataset.setSize(enrich.getSize() != null && compareTrust(mergedDataset, enrich) < 0 ? enrich.getSize() : mergedDataset.getSize()); + + mergedDataset.setVersion(enrich.getVersion() != null && compareTrust(mergedDataset, enrich) < 0 ? 
enrich.getVersion() : mergedDataset.getVersion()); + + mergedDataset.setLastmetadataupdate( + enrich.getLastmetadataupdate() != null && compareTrust(mergedDataset,enrich) < 0 + ? enrich.getLastmetadataupdate() + : mergedDataset.getLastmetadataupdate()); + + mergedDataset.setMetadataversionnumber( + enrich.getMetadataversionnumber() != null && compareTrust(mergedDataset, enrich) < 0 + ? enrich.getMetadataversionnumber() + : mergedDataset.getMetadataversionnumber()); + + mergedDataset.setGeolocation(mergeLists(mergedDataset.getGeolocation(), enrich.getGeolocation())); + + mergeEntityDataInfo(mergedDataset, enrich); + + return mergedDataset; + } + + public static Publication mergePublication(Publication original, Publication enrich) { + + final Publication mergedPublication = (Publication) mergeResult(original, enrich); + + mergeEntityDataInfo(mergedPublication, enrich); + return mergedPublication; + } + + public static Oaf mergeOrganization(Organization original, Organization enrich) { + + final Organization mergedOrganization = (Organization) mergeEntity(original, enrich); + + int ct = compareTrust(mergedOrganization, enrich); + mergedOrganization.setLegalshortname(enrich.getLegalshortname() != null && ct < 0 + ? enrich.getLegalshortname() + : mergedOrganization.getLegalname()); + + + mergedOrganization.setLegalname(enrich.getLegalname() != null && ct < 0 ? + enrich.getLegalname() + : mergedOrganization.getLegalname()); + + mergedOrganization.setAlternativeNames(mergeLists(enrich.getAlternativeNames(), mergedOrganization.getAlternativeNames())); + + + mergedOrganization.setWebsiteurl(enrich.getWebsiteurl() != null && ct < 0 + ? enrich.getWebsiteurl() + : mergedOrganization.getWebsiteurl()); + + mergedOrganization.setLogourl(enrich.getLogourl() != null && ct < 0 + ? enrich.getLogourl() + : mergedOrganization.getLogourl()); + + mergedOrganization.setEclegalbody(enrich.getEclegalbody() != null && ct < 0 + ? 
enrich.getEclegalbody() + : mergedOrganization.getEclegalbody()); + + mergedOrganization.setEclegalperson(enrich.getEclegalperson() != null && ct < 0 + ? enrich.getEclegalperson() + : mergedOrganization.getEclegalperson()); + + mergedOrganization.setEcnonprofit (enrich.getEcnonprofit() != null && ct< 0 + ? enrich.getEcnonprofit() + : mergedOrganization.getEcnonprofit()); + + mergedOrganization.setEcresearchorganization (enrich.getEcresearchorganization() != null && ct < 0 + ? enrich.getEcresearchorganization() + : mergedOrganization.getEcresearchorganization()); + + mergedOrganization.setEchighereducation (enrich.getEchighereducation() != null && ct < 0 + ? enrich.getEchighereducation() + : mergedOrganization.getEchighereducation()); + + mergedOrganization.setEcinternationalorganizationeurinterests (enrich.getEcinternationalorganizationeurinterests() != null && ct< 0 + ? enrich.getEcinternationalorganizationeurinterests() + : mergedOrganization.getEcinternationalorganizationeurinterests()); + + mergedOrganization.setEcinternationalorganization (enrich.getEcinternationalorganization() != null && ct < 0 + ? enrich.getEcinternationalorganization() + : mergedOrganization.getEcinternationalorganization()); + + mergedOrganization.setEcenterprise (enrich.getEcenterprise() != null && ct < 0 + ? enrich.getEcenterprise() + : mergedOrganization.getEcenterprise()); + + mergedOrganization.setEcsmevalidated (enrich.getEcsmevalidated() != null && ct < 0 + ? enrich.getEcsmevalidated() + : mergedOrganization.getEcsmevalidated()); + mergedOrganization.setEcnutscode( enrich.getEcnutscode() != null && ct < 0 + ? enrich.getEcnutscode() + : mergedOrganization.getEcnutscode()); + + mergedOrganization.setCountry (enrich.getCountry() != null && ct < 0 ? 
+ enrich.getCountry() + :mergedOrganization.getCountry()); + + mergeEntityDataInfo(mergedOrganization, enrich); + + return mergedOrganization; + } + + public static Oaf mergeOAFProject(Project original, Project enrich) { + + final Project mergedProject = (Project) mergeEntity(original, enrich); + + int ct = compareTrust(mergedProject, enrich); + + + mergedProject.setWebsiteurl (enrich.getWebsiteurl() != null && ct < 0 + ? enrich.getWebsiteurl() + : mergedProject.getWebsiteurl()); + + mergedProject.setCode(enrich.getCode() != null && ct < 0 ? + enrich.getCode() : + mergedProject.getCode()); + + mergedProject.setAcronym(enrich.getAcronym() != null && ct < 0 + ? enrich.getAcronym() + : mergedProject.getAcronym()); + + mergedProject.setTitle (enrich.getTitle() != null && ct < 0 + ? enrich.getTitle() + : mergedProject.getTitle()); + mergedProject.setStartdate (enrich.getStartdate() != null && ct < 0 + ? enrich.getStartdate() + : mergedProject.getStartdate()); + mergedProject.setEnddate (enrich.getEnddate() != null && ct < 0 + ? enrich.getEnddate() + : mergedProject.getEnddate()); + mergedProject.setCallidentifier ( enrich.getCallidentifier() != null && ct < 0 + ? enrich.getCallidentifier() + : mergedProject.getCallidentifier()); + mergedProject.setKeywords ( enrich.getKeywords() != null && ct < 0 + ? enrich.getKeywords() + : mergedProject.getKeywords()); + + mergedProject.setDuration ( enrich.getDuration() != null && ct < 0 + ? enrich.getDuration() + : mergedProject.getDuration()); + mergedProject.setEcsc39 ( enrich.getEcsc39() != null && ct < 0 + ? enrich.getEcsc39() : + mergedProject.getEcsc39()); + mergedProject.setOamandatepublications ( enrich.getOamandatepublications() != null && ct < 0 + ? enrich.getOamandatepublications() + : mergedProject.getOamandatepublications()); + mergedProject.setEcarticle29_3 (enrich.getEcarticle29_3() != null && ct < 0 + ? 
enrich.getEcarticle29_3() + : mergedProject.getEcarticle29_3()); + + mergedProject.setSubjects (mergeLists(mergedProject.getSubjects(), enrich.getSubjects())); + mergedProject.setFundingtree (mergeLists(mergedProject.getFundingtree(), enrich.getFundingtree())); + mergedProject.setContracttype (enrich.getContracttype() != null && ct < 0 + ? enrich.getContracttype() + : mergedProject.getContracttype()); + mergedProject.setOptional1 ( enrich.getOptional1() != null && ct < 0 + ? enrich.getOptional1() + : mergedProject.getOptional1()); + mergedProject.setOptional2 (enrich.getOptional2() != null && ct < 0 + ? enrich.getOptional2() + : mergedProject.getOptional2()); + + mergedProject.setJsonextrainfo ( enrich.getJsonextrainfo() != null && ct < 0 + ? enrich.getJsonextrainfo() + : mergedProject.getJsonextrainfo()); + + mergedProject.setContactfullname ( enrich.getContactfullname() != null && ct < 0 + ? enrich.getContactfullname() + : mergedProject.getContactfullname()); + + mergedProject.setContactfax ( enrich.getContactfax() != null && ct < 0 + ? enrich.getContactfax() + : mergedProject.getContactfax()); + + mergedProject.setContactphone (enrich.getContactphone() != null && ct < 0 + ? enrich.getContactphone() + : mergedProject.getContactphone()); + + mergedProject.setContactemail ( enrich.getContactemail() != null && ct < 0 + ? enrich.getContactemail() + : mergedProject.getContactemail()); + + mergedProject.setSummary ( enrich.getSummary() != null && ct < 0 + ? enrich.getSummary() + : mergedProject.getSummary()); + + mergedProject.setCurrency( enrich.getCurrency() != null && ct < 0 + ? 
enrich.getCurrency() + : mergedProject.getCurrency()); + + if (enrich.getH2020topiccode() != null && StringUtils.isEmpty(mergedProject.getH2020topiccode())){ + mergedProject.setH2020topiccode(enrich.getH2020topiccode()); + mergedProject.setH2020topicdescription(enrich.getH2020topicdescription()); + } + + mergedProject.setH2020classification(mergeLists(mergedProject.getH2020classification(), enrich.getH2020classification())); + + mergeEntityDataInfo(mergedProject, enrich); + + return mergedProject; + } + + private static Entity mergeEntity(Entity original, Entity enrich) { + + final Entity mergedEntity = original; + + mergedEntity.setOriginalId(mergeLists(mergedEntity.getOriginalId(), enrich.getOriginalId())); + mergedEntity.setCollectedfrom(mergeLists(mergedEntity.getCollectedfrom(), enrich.getCollectedfrom())); + + if (mergedEntity.getLastupdatetimestamp() == null && enrich.getLastupdatetimestamp() != null) { + mergedEntity.setLastupdatetimestamp(enrich.getLastupdatetimestamp()); + } else if (mergedEntity.getLastupdatetimestamp() != null && enrich.getLastupdatetimestamp() != null) { + mergedEntity.setLastupdatetimestamp(Long.max(mergedEntity.getLastupdatetimestamp(), enrich.getLastupdatetimestamp())); + } + + mergedEntity.setPid(mergeLists(mergedEntity.getPid(), enrich.getPid())); + + final int trustCompareResult = compareTrust(mergedEntity, enrich); + if (enrich.getDateofcollection() != null && trustCompareResult < 0) + mergedEntity.setDateofcollection(enrich.getDateofcollection()); + + if (enrich.getDateoftransformation() != null && trustCompareResult < 0) + mergedEntity.setDateoftransformation(enrich.getDateoftransformation()); + + mergedEntity.setMeasures(mergeLists(mergedEntity.getMeasures(), enrich.getMeasures())); + mergedEntity.setExtraInfo(mergeLists(mergedEntity.getExtraInfo(), enrich.getExtraInfo())); + + return mergedEntity; + } + + public static Relation mergeRelation(Relation original, Relation enrich) { + + 
checkArgument(Objects.equals(original.getSource(), enrich.getSource()), "source ids must be equal"); + checkArgument(Objects.equals(original.getTarget(), enrich.getTarget()), "target ids must be equal"); + checkArgument(Objects.equals(original.getRelType(), enrich.getRelType()), "relType(s) must be equal"); + checkArgument( + Objects.equals(original.getSubRelType(), enrich.getSubRelType()), "subRelType(s) must be equal"); + checkArgument(Objects.equals(original.getRelClass(), enrich.getRelClass()), "relClass(es) must be equal"); + + original.setProvenance(mergeLists(original.getProvenance(), enrich.getProvenance())); + + original.setValidated(original.getValidated() || enrich.getValidated()); + try { + original.setValidationDate(ModelSupport.oldest(original.getValidationDate(), enrich.getValidationDate())); + } catch (ParseException e) { + throw new IllegalArgumentException(String + .format( + "invalid validation date format in relation [s:%s, t:%s]: %s", original.getSource(), original.getTarget(), + original.getValidationDate())); + } + + return original; + } + + private static void mergeEntityDataInfo(Entity from, Entity to) { + Optional.ofNullable(to) + .ifPresent(other -> Optional.ofNullable(other.getDataInfo()) + .ifPresent(otherDataInfo -> Optional.ofNullable(from.getDataInfo()) + .ifPresent(thisDataInfo -> { + if (compareTrust(from, other) < 0 || thisDataInfo.getInvisible()) { + from.setDataInfo(otherDataInfo); + } + }))); + } + + /** + * Gets main title. + * + * @param titles the titles + * @return the main title + */ + private static StructuredProperty getMainTitle(List titles) { + // need to check if the list of titles contains more than 1 main title? 
(in that case, we should chose which + // main title select in the list) + for (StructuredProperty t : titles) { + if (t.getQualifier() != null && t.getQualifier().getClassid() != null) + if (t.getQualifier().getClassid().equals("main title")) + return t; + } + return null; + } + + /** + * Longest lists list. + * + * @param a the a + * @param b the b + * @return the list + */ + public static List longestLists(List a, List b) { + if (a == null || b == null) + return a == null ? b : a; + if (a.size() == b.size()) { + int msa = a + .stream() + .filter(i -> i != null) + .map(i -> i.length()) + .max(Comparator.naturalOrder()) + .orElse(0); + int msb = b + .stream() + .filter(i -> i != null ) + .map(i -> i.length()) + .max(Comparator.naturalOrder()) + .orElse(0); + return msa > msb ? a : b; + } + return a.size() > b.size() ? a : b; + } + + /** + * This main method apply the enrichment of the instances + * + * @param toEnrichInstances the instances that could be enriched + * @param enrichmentInstances the enrichment instances + * @return list of instances possibly enriched + */ + private static List enrichInstances(final List toEnrichInstances,final List enrichmentInstances) { + final List enrichmentResult = new ArrayList<>(); + + if (toEnrichInstances == null) { + return enrichmentResult; + } + if (enrichmentInstances == null) { + return enrichmentResult; + } + Map ri = toInstanceMap(enrichmentInstances); + + toEnrichInstances.forEach(i -> { + final List e = findEnrichmentsByPID(i.getPid(), ri); + if (e!= null && e.size()> 0) { + e.forEach(enr -> applyEnrichment(i, enr)); + } else { + final List a = findEnrichmentsByPID(i.getAlternateIdentifier(), ri); + if (a!= null && a.size()> 0) { + a.forEach(enr -> applyEnrichment(i, enr)); + } + } + enrichmentResult.add(i); + }); + return enrichmentResult; + } + + /** + * This method converts the list of instance enrichments + * into a Map where the key is the normalized identifier + * and the value is the instance itself + * + * 
@param ri the list of enrichment instances + * @return the result map + */ + private static Map toInstanceMap(final List ri) { + return ri + .stream() + .filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null) + .flatMap(i -> { + final List> result = new ArrayList<>(); + if (i.getPid() != null) + i.getPid().stream().filter(MergeUtils::validPid).forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); + if (i.getAlternateIdentifier() != null) + i.getAlternateIdentifier().stream().filter(MergeUtils::validPid).forEach(p -> result.add(new ImmutablePair<>(extractKeyFromPid(p), i))); + return result.stream(); + }).collect(Collectors.toMap( + Pair::getLeft, + Pair::getRight, + (a, b) -> a + )); + } + + /** + * Valid pid boolean. + * + * @param p the p + * @return the boolean + */ + private static boolean validPid(final StructuredProperty p) { + return p.getValue()!= null && p.getQualifier()!= null && p.getQualifier().getClassid()!=null; + } + + /** + * Normalize pid string. + * + * @param pid the pid + * @return the string + */ + private static String extractKeyFromPid(final StructuredProperty pid) { + if (pid == null) + return null; + final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid); + + return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue()); + } + + /** + * This utility method finds the list of enrichment instances + * that match one or more PIDs in the input list + * + * @param pids the list of PIDs + * @param enrichments the List of enrichment instances having the same pid + * @return the list + */ + private static List findEnrichmentsByPID(final List pids, final Map enrichments) { + if (pids == null || enrichments == null) + return null; + return pids + .stream() + .map(MergeUtils::extractKeyFromPid) + .map(enrichments::get) + .filter(Objects::nonNull) + .collect(Collectors.toList()); + } + + /** + * Is an enrichment boolean. 
+ * + * @param e the e + * @return the boolean + */ + public static boolean isAnEnrichment(Entity e) { + return e.getDataInfo() != null && + e.getDataInfo().getProvenanceaction()!= null + && ModelConstants.PROVENANCE_ENRICH.equalsIgnoreCase(e.getDataInfo().getProvenanceaction().getClassid()); + } + + /** + * This method apply enrichment on a single instance + * The enrichment consists of replacing values on + * single attribute only if in the current instance is missing + * The only repeatable field enriched is measures + * + * @param currentInstance the current instance + * @param enrichment the enrichment instance + */ + private static void applyEnrichment(final Instance currentInstance, final Instance enrichment) { + if (currentInstance == null || enrichment == null) + return; + + //ENRICH accessright + if (enrichment.getAccessright()!=null && currentInstance.getAccessright() == null) + currentInstance.setAccessright(enrichment.getAccessright()); + + //ENRICH license + if (enrichment.getLicense()!=null && currentInstance.getLicense() == null) + currentInstance.setLicense(enrichment.getLicense()); + + //ENRICH instanceType + if (enrichment.getInstancetype()!=null && currentInstance.getInstancetype() == null) + currentInstance.setInstancetype(enrichment.getInstancetype()); + + //ENRICH hostedby + if (enrichment.getHostedby()!=null && currentInstance.getHostedby() == null) + currentInstance.setHostedby(enrichment.getHostedby()); + + //ENRICH distributionlocation + if (enrichment.getDistributionlocation()!=null && currentInstance.getDistributionlocation() == null) + currentInstance.setDistributionlocation(enrichment.getDistributionlocation()); + + //ENRICH collectedfrom + if (enrichment.getCollectedfrom()!=null && currentInstance.getCollectedfrom() == null) + currentInstance.setCollectedfrom(enrichment.getCollectedfrom()); + + //ENRICH dateofacceptance + if (enrichment.getDateofacceptance()!=null && currentInstance.getDateofacceptance() == null) + 
currentInstance.setDateofacceptance(enrichment.getDateofacceptance()); + + //ENRICH processingchargeamount + if (enrichment.getProcessingchargeamount()!=null && currentInstance.getProcessingchargeamount() == null) + currentInstance.setProcessingchargeamount(enrichment.getProcessingchargeamount()); + + //ENRICH refereed + if (enrichment.getRefereed()!=null && currentInstance.getRefereed() == null) + currentInstance.setRefereed(enrichment.getRefereed()); + + //TODO check the other Instance fields + } + + private static List mergeLists(final List... lists) { + return Arrays + .stream(lists) + .filter(Objects::nonNull) + .flatMap(List::stream) + .filter(Objects::nonNull) + .distinct() + .collect(Collectors.toList()); + } + + private static int compareTrust(Entity a, Entity b) { + return Float.compare( + Optional.ofNullable(a.getDataInfo()) + .map(DataInfo::getTrust) + .orElse(0f), + Optional.ofNullable(b.getDataInfo()) + .map(DataInfo::getTrust) + .orElse(0f)); + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils2.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils2.java new file mode 100644 index 000000000..60ea5bf1f --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils2.java @@ -0,0 +1,156 @@ +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.lang.reflect.Field; +import java.util.Collection; +import java.util.Iterator; + +public class MergeUtils2 { + + /** + * Recursively merges the fields of the provider into the receiver. + * + * @param receiver the receiver instance. + * @param provider the provider instance. 
+ */ + public static void merge(final T receiver, final T provider) { + Field[] fields = receiver.getClass().getDeclaredFields(); + for (Field field : fields) { + + try { + field.setAccessible(true); + Object receiverObject = field.get(receiver); + Object providerObject = field.get(provider); + + if (receiverObject == null || providerObject == null) { + /* One is null */ + + field.set(receiver, providerObject); + } else if (field.getType().isAssignableFrom(Collection.class)) { + /* Collection field */ + // noinspection rawtypes + mergeCollections((Collection) receiverObject, (Collection) providerObject); + } else if (field.getType().isPrimitive() || field.getType().isEnum() + || field.getType().equals(String.class)) { + /* Primitive, Enum or String field */ + field.set(receiver, providerObject); + } else { + /* Mergeable field */ + merge(receiverObject, providerObject); + } + } catch (IllegalAccessException e) { + /* Should not happen */ + throw new RuntimeException(e); + } + } + } + + /** + * Recursively merges the items in the providers collection into the receivers collection. + * Receivers not present in providers will be removed, providers not present in receivers will be added. + * If the item has a field called 'id', this field will be compared to match the items. + * + * @param receivers the collection containing the receiver instances. + * @param providers the collection containing the provider instances. 
+ */ + public static void mergeCollections(final Collection receivers, final Collection providers) { + if (receivers.isEmpty() && providers.isEmpty()) { + return; + } + + if (providers.isEmpty()) { + receivers.clear(); + return; + } + + if (receivers.isEmpty()) { + receivers.addAll(providers); + return; + } + + Field idField; + try { + T t = providers.iterator().next(); + idField = t.getClass().getDeclaredField("id"); + idField.setAccessible(true); + } catch (NoSuchFieldException ignored) { + idField = null; + } + + try { + if (idField != null) { + mergeCollectionsWithId(receivers, providers, idField); + } else { + mergeCollectionsSimple(receivers, providers); + } + } catch (IllegalAccessException e) { + /* Should not happen */ + throw new RuntimeException(e); + } + } + + /** + * Recursively merges the items in the collections for which the id's are equal. + * + * @param receivers the collection containing the receiver items. + * @param providers the collection containing the provider items. + * @param idField the id field. + * + * @throws IllegalAccessException if the id field is not accessible. + */ + private static void mergeCollectionsWithId(final Collection receivers, final Iterable providers, + final Field idField) throws IllegalAccessException { + /* Find a receiver for each provider */ + for (T provider : providers) { + boolean found = false; + for (T receiver : receivers) { + if (idField.get(receiver).equals(idField.get(provider))) { + merge(receiver, provider); + found = true; + } + } + if (!found) { + receivers.add(provider); + } + } + + /* Remove receivers not in providers */ + for (Iterator iterator = receivers.iterator(); iterator.hasNext();) { + T receiver = iterator.next(); + boolean found = false; + for (T provider : providers) { + if (idField.get(receiver).equals(idField.get(provider))) { + found = true; + } + } + if (!found) { + iterator.remove(); + } + } + } + + /** + * Recursively merges the items in the collections one by one. 
Disregards equality. + * + * @param receivers the collection containing the receiver items. + * @param providers the collection containing the provider items. + */ + private static void mergeCollectionsSimple(final Collection receivers, final Iterable providers) { + Iterator receiversIterator = receivers.iterator(); + Iterator providersIterator = providers.iterator(); + while (receiversIterator.hasNext() && providersIterator.hasNext()) { + merge(receiversIterator.next(), providersIterator.next()); + } + + /* Remove excessive receivers if present */ + while (receiversIterator.hasNext()) { + receiversIterator.next(); + receiversIterator.remove(); + } + + /* Add residual providers to receivers if present */ + while (providersIterator.hasNext()) { + receivers.add(providersIterator.next()); + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils3.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils3.java new file mode 100644 index 000000000..cb3f67c8b --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/MergeUtils3.java @@ -0,0 +1,89 @@ +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.lang.reflect.Field; +import java.math.BigDecimal; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; + + +import static org.apache.commons.lang3.ClassUtils.isPrimitiveOrWrapper; + +public class MergeUtils3 { + + private final List selfObjects; + private final Object source; + private final Object target; + + private MergeUtils3(Object source, Object target) { + this.source = source; + this.target = target; + this.selfObjects = new ArrayList<>(); + } + + public static MergeUtils3 mergerOf(Object source, Object target) { + return new MergeUtils3(source, target); + } + + public final void merge() { + try { + merge(source, target); + } catch (IllegalAccessException | NoSuchFieldException e) { + throw new RuntimeException("Merge error: ", e); + 
} + } + + private void merge(Object source, Object target) throws IllegalAccessException, NoSuchFieldException { + selfObjects.add(source); + + Field[] declaredFields = source.getClass().getDeclaredFields(); + for (Field declaredField : declaredFields) { + declaredField.setAccessible(true); + + Object fieldValue = declaredField.get(source); + if (fieldValue == null || selfObjects.contains(fieldValue)) { + continue; + } + + Class declaredFieldType = declaredField.getType(); + if (isJdkType(declaredField)) { + Field targetField = target.getClass().getDeclaredField(declaredField.getName()); + targetField.setAccessible(true); + + targetField.set(target, fieldValue); + continue; + } + + if (Collection.class.isAssignableFrom(declaredFieldType)) { + Iterable sourceCollection = (Iterable) declaredField.get(source); + Iterable targetCollection = (Iterable) declaredField.get(target); + + merge(sourceCollection, targetCollection); + continue; + } + + merge(declaredField.get(source), declaredField.get(target)); + } + } + + private boolean isJdkType(Field field) { + Class declaredFieldType = field.getType(); + String fieldTypeName = declaredFieldType.getName(); + + return isPrimitiveOrWrapper(declaredFieldType) + || fieldTypeName.equals(String.class.getName()) + || fieldTypeName.equals(BigDecimal.class.getName()); + } + + private void merge(Iterable source, Iterable target) throws NoSuchFieldException, IllegalAccessException { + Iterator sourceIterator = source.iterator(); + Iterator targetIterator = target.iterator(); + + while (sourceIterator.hasNext()) { + merge(sourceIterator.next(), targetIterator.next()); + } + } +} + + diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java new file mode 100644 index 000000000..6014201af --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ModelHardLimits.java @@ -0,0 +1,25 @@ + +package 
eu.dnetlib.dhp.schema.oaf.utils; + +public class ModelHardLimits { + + private ModelHardLimits() { + } + + public static final String LAYOUT = "index"; + public static final String INTERPRETATION = "openaire"; + public static final String SEPARATOR = "-"; + + public static final int MAX_EXTERNAL_ENTITIES = 50; + public static final int MAX_AUTHORS = 200; + public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; + public static final int MAX_TITLE_LENGTH = 5000; + public static final int MAX_TITLES = 10; + public static final int MAX_ABSTRACT_LENGTH = 150000; + public static final int MAX_INSTANCES = 10; + + public static String getCollectionName(String format) { + return format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION; + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java index c58096d35..bd710e259 100644 --- a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtils.java @@ -11,11 +11,10 @@ import java.util.function.Function; import java.util.function.Predicate; import java.util.stream.Collectors; +import eu.dnetlib.dhp.schema.oaf.common.AccessRightComparator; +import eu.dnetlib.dhp.schema.oaf.common.ModelSupport; import org.apache.commons.lang3.StringUtils; -import eu.dnetlib.dhp.schema.common.AccessRightComparator; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.oaf.*; public class OafMapperUtils { @@ -24,29 +23,25 @@ public class OafMapperUtils { } public static Oaf merge(final Oaf left, final Oaf right) { - if (ModelSupport.isSubClass(left, OafEntity.class)) { - return mergeEntities((OafEntity) left, (OafEntity) right); + if (ModelSupport.isSubClass(left, Entity.class)) { + return mergeEntities((Entity) left, (Entity) right); } else if 
(ModelSupport.isSubClass(left, Relation.class)) { - ((Relation) left).mergeFrom((Relation) right); + return MergeUtils.mergeRelation((Relation) left, (Relation) right); } else { throw new IllegalArgumentException("invalid Oaf type:" + left.getClass().getCanonicalName()); } - return left; } - public static OafEntity mergeEntities(OafEntity left, OafEntity right) { + public static Entity mergeEntities(Entity left, Entity right) { if (ModelSupport.isSubClass(left, Result.class)) { return mergeResults((Result) left, (Result) right); - } else if (ModelSupport.isSubClass(left, Datasource.class)) { - left.mergeFrom(right); - } else if (ModelSupport.isSubClass(left, Organization.class)) { - left.mergeFrom(right); - } else if (ModelSupport.isSubClass(left, Project.class)) { - left.mergeFrom(right); + } else if (ModelSupport.isSubClass(left, Datasource.class) || + ModelSupport.isSubClass(left, Organization.class) || + ModelSupport.isSubClass(left, Project.class)) { + return (Entity) merge(left, right); } else { - throw new IllegalArgumentException("invalid OafEntity subtype:" + left.getClass().getCanonicalName()); + throw new IllegalArgumentException("invalid Entity subtype:" + left.getClass().getCanonicalName()); } - return left; } public static Result mergeResults(Result left, Result right) { @@ -60,13 +55,10 @@ public class OafMapperUtils { if (!leftFromDelegatedAuthority && rightFromDelegatedAuthority) { return right; } - if (new ResultTypeComparator().compare(left, right) < 0) { - left.mergeFrom(right); - return left; + return MergeUtils.mergeResult(left, right); } else { - right.mergeFrom(left); - return right; + return MergeUtils.mergeResult(right, left); } } @@ -101,26 +93,6 @@ public class OafMapperUtils { return list; } - public static Field field(final T value, final DataInfo info) { - if (value == null || StringUtils.isBlank(value.toString())) { - return null; - } - - final Field field = new Field<>(); - field.setValue(value); - field.setDataInfo(info); - return 
field; - } - - public static List> listFields(final DataInfo info, final String... values) { - return Arrays - .stream(values) - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .filter(distinctByKey(Field::getValue)) - .collect(Collectors.toList()); - } - public static List listValues(Array values) throws SQLException { if (Objects.isNull(values)) { return null; @@ -132,17 +104,8 @@ public class OafMapperUtils { .collect(Collectors.toList()); } - public static List> listFields(final DataInfo info, final List values) { - return values - .stream() - .map(v -> field(v, info)) - .filter(Objects::nonNull) - .filter(distinctByKey(Field::getValue)) - .collect(Collectors.toList()); - } - - public static Qualifier unknown(final String schemeid, final String schemename) { - return qualifier(UNKNOWN, "Unknown", schemeid, schemename); + public static Qualifier unknown(final String schemeid) { + return qualifier(UNKNOWN, "Unknown", schemeid); } public static AccessRight accessRight( @@ -163,7 +126,6 @@ public class OafMapperUtils { accessRight.setClassid(classid); accessRight.setClassname(classname); accessRight.setSchemeid(schemeid); - accessRight.setSchemename(schemename); accessRight.setOpenAccessRoute(openAccessRoute); return accessRight; } @@ -171,13 +133,11 @@ public class OafMapperUtils { public static Qualifier qualifier( final String classid, final String classname, - final String schemeid, - final String schemename) { + final String schemeid) { final Qualifier q = new Qualifier(); q.setClassid(classid); q.setClassname(classname); q.setSchemeid(schemeid); - q.setSchemename(schemename); return q; } @@ -186,7 +146,6 @@ public class OafMapperUtils { q.setClassid(qualifier.getClassid()); q.setClassname(qualifier.getClassname()); q.setSchemeid(qualifier.getSchemeid()); - q.setSchemename(qualifier.getSchemename()); return q; } @@ -195,21 +154,18 @@ public class OafMapperUtils { final String classid, final String classname, final String schemeid, - final String 
schemename, final DataInfo dataInfo) { - return subject(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + return subject(value, qualifier(classid, classname, schemeid), dataInfo); } public static StructuredProperty structuredProperty( final String value, final String classid, final String classname, - final String schemeid, - final String schemename, - final DataInfo dataInfo) { + final String schemeid) { - return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo); + return structuredProperty(value, qualifier(classid, classname, schemeid)); } public static Subject subject( @@ -228,18 +184,42 @@ public class OafMapperUtils { public static StructuredProperty structuredProperty( final String value, - final Qualifier qualifier, - final DataInfo dataInfo) { + final Qualifier qualifier) { if (value == null) { return null; } final StructuredProperty sp = new StructuredProperty(); sp.setValue(value); sp.setQualifier(qualifier); - sp.setDataInfo(dataInfo); return sp; } + public static Publisher publisher(final String name) { + final Publisher p = new Publisher(); + p.setName(name); + return p; + } + + public static License license(final String url) { + final License l = new License(); + l.setUrl(url); + return l; + } + + public static AuthorPid authorPid( + final String value, + final Qualifier qualifier, + final DataInfo dataInfo) { + if (value == null) { + return null; + } + final AuthorPid ap = new AuthorPid(); + ap.setValue(value); + ap.setQualifier(qualifier); + ap.setDataInfo(dataInfo); + return ap; + } + public static ExtraInfo extraInfo( final String name, final String value, @@ -340,19 +320,32 @@ public class OafMapperUtils { } public static DataInfo dataInfo( - final Boolean deletedbyinference, + final float trust, final String inferenceprovenance, - final Boolean inferred, - final Boolean invisible, - final Qualifier provenanceaction, - final String trust) { + final boolean inferred, + final Qualifier 
provenanceaction) { final DataInfo d = new DataInfo(); + d.setTrust(trust); + d.setInferenceprovenance(inferenceprovenance); + d.setInferred(inferred); + d.setProvenanceaction(provenanceaction); + return d; + } + + public static EntityDataInfo dataInfo( + final boolean invisible, + final boolean deletedbyinference, + final float trust, + final String inferenceprovenance, + final boolean inferred, + final Qualifier provenanceaction) { + final EntityDataInfo d = new EntityDataInfo(); + d.setTrust(trust); + d.setInvisible(invisible); d.setDeletedbyinference(deletedbyinference); d.setInferenceprovenance(inferenceprovenance); d.setInferred(inferred); - d.setInvisible(invisible); d.setProvenanceaction(provenanceaction); - d.setTrust(trust); return d; } @@ -422,9 +415,6 @@ public class OafMapperUtils { if (StringUtils.isBlank(rights.getSchemeid())) { rights.setSchemeid(DNET_ACCESS_MODES); } - if (StringUtils.isBlank(rights.getSchemename())) { - rights.setSchemename(DNET_ACCESS_MODES); - } return rights; } @@ -433,7 +423,6 @@ public class OafMapperUtils { public static KeyValue newKeyValueInstance(String key, String value, DataInfo dataInfo) { KeyValue kv = new KeyValue(); - kv.setDataInfo(dataInfo); kv.setKey(key); kv.setValue(value); return kv; @@ -451,7 +440,7 @@ public class OafMapperUtils { final String relType, final String subRelType, final String relClass, - final OafEntity entity) { + final Entity entity) { return getRelation(source, target, relType, subRelType, relClass, entity, null); } @@ -460,11 +449,12 @@ public class OafMapperUtils { final String relType, final String subRelType, final String relClass, - final OafEntity entity, + final Entity entity, final String validationDate) { + + final List provenance = getProvenance(entity.getCollectedfrom(), entity.getDataInfo()); return getRelation( - source, target, relType, subRelType, relClass, entity.getCollectedfrom(), entity.getDataInfo(), - entity.getLastupdatetimestamp(), validationDate, null); + source, 
target, relType, subRelType, relClass, provenance, validationDate, null); } public static Relation getRelation(final String source, @@ -472,11 +462,9 @@ public class OafMapperUtils { final String relType, final String subRelType, final String relClass, - final List collectedfrom, - final DataInfo dataInfo, - final Long lastupdatetimestamp) { + final List provenance) { return getRelation( - source, target, relType, subRelType, relClass, collectedfrom, dataInfo, lastupdatetimestamp, null, null); + source, target, relType, subRelType, relClass, provenance, null, null); } public static Relation getRelation(final String source, @@ -484,9 +472,7 @@ public class OafMapperUtils { final String relType, final String subRelType, final String relClass, - final List collectedfrom, - final DataInfo dataInfo, - final Long lastupdatetimestamp, + final List provenance, final String validationDate, final List properties) { final Relation rel = new Relation(); @@ -495,15 +481,27 @@ public class OafMapperUtils { rel.setRelClass(relClass); rel.setSource(source); rel.setTarget(target); - rel.setCollectedfrom(collectedfrom); - rel.setDataInfo(dataInfo); - rel.setLastupdatetimestamp(lastupdatetimestamp); + rel.setProvenance(provenance); rel.setValidated(StringUtils.isNotBlank(validationDate)); rel.setValidationDate(StringUtils.isNotBlank(validationDate) ? 
validationDate : null); rel.setProperties(properties); return rel; } + public static List getProvenance(final List collectedfrom, final DataInfo dataInfo) { + return collectedfrom + .stream() + .map(cf -> getProvenance(cf, dataInfo)) + .collect(Collectors.toList()); + } + + public static Provenance getProvenance(final KeyValue collectedfrom, final DataInfo dataInfo) { + final Provenance prov = new Provenance(); + prov.setCollectedfrom(collectedfrom); + prov.setDataInfo(dataInfo); + return prov; + } + public static String getProvenance(DataInfo dataInfo) { return Optional .ofNullable(dataInfo) diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java new file mode 100644 index 000000000..3a6df2924 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/OrganizationPidComparator.java @@ -0,0 +1,38 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.Comparator; + +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class OrganizationPidComparator implements Comparator { + + @Override + public int compare(StructuredProperty left, StructuredProperty right) { + + PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid()); + PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid()); + + if (lClass.equals(PidType.openorgs)) + return -1; + if (rClass.equals(PidType.openorgs)) + return 1; + + if (lClass.equals(PidType.GRID)) + return -1; + if (rClass.equals(PidType.GRID)) + return 1; + + if (lClass.equals(PidType.mag_id)) + return -1; + if (rClass.equals(PidType.mag_id)) + return 1; + + if (lClass.equals(PidType.urn)) + return -1; + if (rClass.equals(PidType.urn)) + return 1; + + return 0; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidBlacklist.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidBlacklist.java new file mode 
100644 index 000000000..0b8e5e3f1 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidBlacklist.java @@ -0,0 +1,8 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.HashMap; +import java.util.HashSet; + +public class PidBlacklist extends HashMap> { +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidBlacklistProvider.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidBlacklistProvider.java new file mode 100644 index 000000000..21a254e69 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidBlacklistProvider.java @@ -0,0 +1,40 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.io.IOException; +import java.util.HashSet; +import java.util.Optional; +import java.util.Set; + +import org.apache.commons.io.IOUtils; + +import com.fasterxml.jackson.databind.ObjectMapper; + +public class PidBlacklistProvider { + + private static final PidBlacklist blacklist; + + static { + try { + String json = IOUtils.toString(IdentifierFactory.class.getResourceAsStream("pid_blacklist.json")); + blacklist = new ObjectMapper().readValue(json, PidBlacklist.class); + + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static PidBlacklist getBlacklist() { + return blacklist; + } + + public static Set getBlacklist(String pidType) { + return Optional + .ofNullable(getBlacklist().get(pidType)) + .orElse(new HashSet<>()); + } + + private PidBlacklistProvider() { + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java new file mode 100644 index 000000000..58df0a1bc --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidComparator.java @@ -0,0 +1,48 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.Comparator; + +import eu.dnetlib.dhp.schema.oaf.Entity; +import 
eu.dnetlib.dhp.schema.oaf.Organization; +import eu.dnetlib.dhp.schema.oaf.Result; +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; +import eu.dnetlib.dhp.schema.oaf.common.ModelSupport; + +public class PidComparator implements Comparator { + + private final T entity; + + public PidComparator(T entity) { + this.entity = entity; + } + + @Override + public int compare(StructuredProperty left, StructuredProperty right) { + + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; + + if (ModelSupport.isSubClass(entity, Result.class)) { + return compareResultPids(left, right); + } + if (ModelSupport.isSubClass(entity, Organization.class)) { + return compareOrganizationPids(left, right); + } + + // Else (but unlikely), lexicographical ordering will do. + return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid()); + } + + private int compareResultPids(StructuredProperty left, StructuredProperty right) { + return new ResultPidComparator().compare(left, right); + } + + private int compareOrganizationPids(StructuredProperty left, StructuredProperty right) { + return new OrganizationPidComparator().compare(left, right); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidType.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidType.java new file mode 100644 index 000000000..392bc02ea --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidType.java @@ -0,0 +1,79 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import org.apache.commons.lang3.EnumUtils; + +public enum PidType { + + /** + * The DOI syntax shall be made up of a DOI prefix and a DOI suffix separated by a forward slash. + * + * There is no defined limit on the length of the DOI name, or of the DOI prefix or DOI suffix. + * + * The DOI name is case-insensitive and can incorporate any printable characters from the legal graphic characters + * of Unicode.
Further constraints on character use (e.g. use of language-specific alphanumeric characters) can be + * defined for an application by the ISO 26324 Registration Authority. + * + * + * DOI prefix: The DOI prefix shall be composed of a directory indicator followed by a registrant code. + * These two components shall be separated by a full stop (period). The directory indicator shall be "10" and + * distinguishes the entire set of character strings (prefix and suffix) as digital object identifiers within the + * resolution system. + * + * Registrant code: The second element of the DOI prefix shall be the registrant code. The registrant code is a + * unique string assigned to a registrant. + * + * DOI suffix: The DOI suffix shall consist of a character string of any length chosen by the registrant. + * Each suffix shall be unique to the prefix element that precedes it. The unique suffix can be a sequential number, + * or it might incorporate an identifier generated from or based on another system used by the registrant + * (e.g. ISAN, ISBN, ISRC, ISSN, ISTC, ISNI; in such cases, a preferred construction for such a suffix can be + * specified, as in Example 1). + * + * Source: https://www.doi.org/doi_handbook/2_Numbering.html#2.2 + */ + doi, + + /** + * PubMed Unique Identifier (PMID) + * + * This field is a 1-to-8 digit accession number with no leading zeros. It is present on all records and is the + * accession number for managing and disseminating records. PMIDs are not reused after records are deleted. + * + * Beginning in February 2012 PMIDs include extensions following a decimal point to account for article versions + * (e.g., 21804956.2). All citations are considered version 1 until replaced. The extended PMID is not displayed + * on the MEDLINE format. + * + * View the citation in abstract format in PubMed to access additional versions when available (see the article in + * the Jan-Feb 2012 NLM Technical Bulletin). 
+ * + * Source: https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmid + */ + pmid, + + /** + * This field contains the unique identifier for the cited article in PubMed Central. The identifier begins with the + * prefix PMC. + * + * Source: https://www.nlm.nih.gov/bsd/mms/medlineelements.html#pmc + */ + pmc, handle, arXiv, nct, pdb, w3id, + + // Organization + openorgs, corda, corda_h2020, GRID, mag_id, urn, + + // Used by dedup + undefined, original; + + public static boolean isValid(String type) { + return EnumUtils.isValidEnum(PidType.class, type); + } + + public static PidType tryValueOf(String s) { + try { + return PidType.valueOf(s); + } catch (Exception e) { + return PidType.original; + } + } + +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java new file mode 100644 index 000000000..0e2083590 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/PidValueComparator.java @@ -0,0 +1,33 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.Comparator; +import java.util.Optional; + +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class PidValueComparator implements Comparator { + + @Override + public int compare(StructuredProperty left, StructuredProperty right) { + + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; + + StructuredProperty l = CleaningFunctions.normalizePidValue(left); + StructuredProperty r = CleaningFunctions.normalizePidValue(right); + + return Optional + .ofNullable(l.getValue()) + .map( + lv -> Optional + .ofNullable(r.getValue()) + .map(rv -> lv.compareTo(rv)) + .orElse(-1)) + .orElse(1); + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java new file mode 100644 index 
000000000..e51c4801f --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultPidComparator.java @@ -0,0 +1,53 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.Comparator; + +import eu.dnetlib.dhp.schema.oaf.StructuredProperty; + +public class ResultPidComparator implements Comparator { + + @Override + public int compare(StructuredProperty left, StructuredProperty right) { + + PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid()); + PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid()); + + if (lClass.equals(PidType.doi)) + return -1; + if (rClass.equals(PidType.doi)) + return 1; + + if (lClass.equals(PidType.pmid)) + return -1; + if (rClass.equals(PidType.pmid)) + return 1; + + if (lClass.equals(PidType.pmc)) + return -1; + if (rClass.equals(PidType.pmc)) + return 1; + + if (lClass.equals(PidType.handle)) + return -1; + if (rClass.equals(PidType.handle)) + return 1; + + if (lClass.equals(PidType.arXiv)) + return -1; + if (rClass.equals(PidType.arXiv)) + return 1; + + if (lClass.equals(PidType.nct)) + return -1; + if (rClass.equals(PidType.nct)) + return 1; + + if (lClass.equals(PidType.pdb)) + return -1; + if (rClass.equals(PidType.pdb)) + return 1; + + return 0; + } +} diff --git a/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java new file mode 100644 index 000000000..a233ae764 --- /dev/null +++ b/dhp-common/src/main/java/eu/dnetlib/dhp/schema/oaf/utils/ResultTypeComparator.java @@ -0,0 +1,77 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID; + +import java.util.Comparator; +import java.util.HashSet; +import java.util.Optional; +import java.util.stream.Collectors; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Result; 
+ +public class ResultTypeComparator implements Comparator { + + @Override + public int compare(Result left, Result right) { + + if (left == null && right == null) + return 0; + if (left == null) + return 1; + if (right == null) + return -1; + + HashSet lCf = getCollectedFromIds(left); + HashSet rCf = getCollectedFromIds(right); + + if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) { + return -1; + } + if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) { + return 1; + } + + String lClass = left.getResulttype(); + String rClass = right.getResulttype(); + + if (lClass.equals(rClass)) + return 0; + + if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID)) + return 1; + + if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID)) + return 1; + + if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID)) + return 1; + + if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID)) + return -1; + if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID)) + return 1; + + // Else (but unlikely), lexicographical ordering will do. 
+ return lClass.compareTo(rClass); + } + + protected HashSet getCollectedFromIds(Result left) { + return Optional + .ofNullable(left.getCollectedfrom()) + .map( + cf -> cf + .stream() + .map(KeyValue::getKey) + .collect(Collectors.toCollection(HashSet::new))) + .orElse(new HashSet<>()); + } +} diff --git a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala index a995016a8..65a7f43af 100644 --- a/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala +++ b/dhp-common/src/main/scala/eu/dnetlib/dhp/sx/graph/scholix/ScholixUtils.scala @@ -175,12 +175,11 @@ object ScholixUtils extends Serializable { } def extractCollectedFrom(relation: Relation): List[ScholixEntityId] = { - if (relation.getCollectedfrom != null && !relation.getCollectedfrom.isEmpty) { - - val l: List[ScholixEntityId] = relation.getCollectedfrom.asScala.map { c => + if (relation.getProvenance != null && !relation.getProvenance.isEmpty) { + val l: List[ScholixEntityId] = relation.getProvenance.asScala.map { p => new ScholixEntityId( - c.getValue, - List(new ScholixIdentifier(c.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava + p.getCollectedfrom.getValue, + List(new ScholixIdentifier(p.getCollectedfrom.getKey, DNET_IDENTIFIER_SCHEMA, null)).asJava ) }.toList l @@ -402,15 +401,15 @@ object ScholixUtils extends Serializable { .getInstance() .asScala .filter(i => i.getDateofacceptance != null) - .map(i => i.getDateofacceptance.getValue) + .map(i => i.getDateofacceptance) .toList if (dt.nonEmpty) s.setDate(dt.distinct.asJava) } if (r.getDescription != null && !r.getDescription.isEmpty) { - val d = r.getDescription.asScala.find(f => f != null && f.getValue != null) + val d = r.getDescription.asScala.find(f => f != null) if (d.isDefined) - s.setDescription(d.get.getValue) + s.setDescription(d.get) } if (r.getSubject != null && !r.getSubject.isEmpty) { @@ -422,7 +421,7 @@ object 
ScholixUtils extends Serializable { } if (r.getPublisher != null) - s.setPublisher(List(r.getPublisher.getValue).asJava) + s.setPublisher(List(r.getPublisher.getName).asJava) if (r.getCollectedfrom != null && !r.getCollectedfrom.isEmpty) { val cf: List[CollectedFromType] = r.getCollectedfrom.asScala diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/common/ModelSupportTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/common/ModelSupportTest.java new file mode 100644 index 000000000..300b20f88 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/common/ModelSupportTest.java @@ -0,0 +1,52 @@ + +package eu.dnetlib.dhp.schema.oaf.common; + +import eu.dnetlib.dhp.schema.oaf.Entity; +import eu.dnetlib.dhp.schema.oaf.Relation; +import eu.dnetlib.dhp.schema.oaf.Result; +import org.junit.jupiter.api.Nested; +import org.junit.jupiter.api.Test; + +import java.io.IOException; + +import static org.junit.jupiter.api.Assertions.*; + +public class ModelSupportTest { + + @Nested + class IsSubClass { + + @Test + void shouldReturnFalseWhenSubClassDoesNotExtendSuperClass() { + // when + Boolean result = ModelSupport.isSubClass(Relation.class, Entity.class); + + // then + assertFalse(result); + } + + @Test + void shouldReturnTrueWhenSubClassExtendsSuperClass() { + // when + Boolean result = ModelSupport.isSubClass(Result.class, Entity.class); + + // then + assertTrue(result); + } + } + + + @Nested + class InverseRelation { + + @Test + void findRelations() throws IOException { + assertNotNull(ModelSupport.findRelation("isMetadataFor")); + assertNotNull(ModelSupport.findRelation("ismetadatafor")); + assertNotNull(ModelSupport.findRelation("ISMETADATAFOR")); + assertNotNull(ModelSupport.findRelation("isRelatedTo")); + + + } + } +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/BlackListProviderTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/BlackListProviderTest.java new file mode 100644 index 
000000000..61d06a6ae --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/BlackListProviderTest.java @@ -0,0 +1,21 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import java.util.Set; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +class BlackListProviderTest { + + @Test + void blackListTest() { + + Assertions.assertNotNull(PidBlacklistProvider.getBlacklist()); + Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi")); + Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0); + final Set xxx = PidBlacklistProvider.getBlacklist("xxx"); + Assertions.assertNotNull(xxx); + Assertions.assertEquals(0, xxx.size()); + } +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java new file mode 100644 index 000000000..bce4b76b5 --- /dev/null +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/IdentifierFactoryTest.java @@ -0,0 +1,87 @@ + +package eu.dnetlib.dhp.schema.oaf.utils; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +import java.io.IOException; + +import org.apache.commons.io.IOUtils; +import org.junit.jupiter.api.Test; + +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; + +import eu.dnetlib.dhp.schema.oaf.Publication; + +class IdentifierFactoryTest { + + private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + + @Test + void testCreateIdentifierForPublication() throws IOException { + + verifyIdentifier( + "publication_doi1.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true); + + verifyIdentifier( + "publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true); + 
+ verifyIdentifier( + "publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true); + + verifyIdentifier( + "publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true); + + verifyIdentifier( + "publication_doi5.json", "50|doi_________::3bef95c0ca26dd55451fc8839ea69d27", true); + + verifyIdentifier( + "publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true); + + verifyIdentifier( + "publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true); + + verifyIdentifier( + "publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true); + + final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; + verifyIdentifier("publication_3.json", defaultID, true); + verifyIdentifier("publication_4.json", defaultID, true); + verifyIdentifier("publication_5.json", defaultID, true); + + } + + @Test + void testCreateIdentifierForPublicationNoHash() throws IOException { + + verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false); + verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false); + verifyIdentifier("publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false); + verifyIdentifier( + "publication_urn1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false); + + final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"; + verifyIdentifier("publication_3.json", defaultID, false); + verifyIdentifier("publication_4.json", defaultID, false); + verifyIdentifier("publication_5.json", defaultID, false); + } + + @Test + void testCreateIdentifierForROHub() throws IOException { + verifyIdentifier( + "orp-rohub.json", "50|w3id________::afc7592914ae190a50570db90f55f9c2", true); + } + + protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException { + final String json = 
IOUtils.toString(getClass().getResourceAsStream(filename)); + final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class); + + String id = IdentifierFactory.createIdentifier(pub, md5); + System.out.println(id); + assertNotNull(id); + assertEquals(expectedID, id); + } + +} diff --git a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java index 9111ac2df..5788d6519 100644 --- a/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java +++ b/dhp-common/src/test/java/eu/dnetlib/dhp/schema/oaf/utils/OafMapperUtilsTest.java @@ -164,24 +164,38 @@ class OafMapperUtilsTest { assertEquals(1, d2.getCollectedfrom().size()); assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); - assertEquals( - ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, - OafMapperUtils - .mergeResults(p1, d2) - .getResulttype() - .getClassid()); - assertEquals(1, p2.getCollectedfrom().size()); assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); assertEquals(1, d1.getCollectedfrom().size()); assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID)); - assertEquals( - ModelConstants.DATASET_RESULTTYPE_CLASSID, - OafMapperUtils - .mergeResults(p2, d1) - .getResulttype() - .getClassid()); + final Result p1d2 = OafMapperUtils.mergeResults(p1, d2); + assertEquals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID, p1d2.getResulttype()); + assertTrue(p1d2 instanceof Publication); + assertEquals(p1.getId(), p1d2.getId()); + } + + @Test + void testMergePubs_1() throws IOException { + Publication p2 = read("publication_2.json", Publication.class); + Dataset d1 = read("dataset_1.json", Dataset.class); + + final Result p2d1 = OafMapperUtils.mergeResults(p2, d1); + assertEquals(ModelConstants.DATASET_RESULTTYPE_CLASSID, p2d1.getResulttype()); + assertTrue(p2d1 instanceof Dataset); + assertEquals(d1.getId(), 
p2d1.getId()); + assertEquals(2, p2d1.getCollectedfrom().size()); + } + + @Test + void testMergePubs_2() throws IOException { + Publication p1 = read("publication_1.json", Publication.class); + Publication p2 = read("publication_2.json", Publication.class); + + Result p1p2 = OafMapperUtils.mergeResults(p1, p2); + assertTrue(p1p2 instanceof Publication); + assertEquals(p1.getId(), p1p2.getId()); + assertEquals(2, p1p2.getCollectedfrom().size()); } @Test diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json index e38c4d1cc..4f209e2e3 100644 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_1.json @@ -1 +1,28 @@ -{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]} \ No newline at end of file +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", + "resuttype": "dataset", + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2011.03.013" + }, + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ], + "collectedfrom": [ + { + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", + "value": "Crossref" + } + ] +} \ No newline at end of file diff --git 
a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json index c880edb7d..beb0cef63 100644 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_2.json @@ -1,6 +1,6 @@ { "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", - "resuttype": {"classid": "dataset"}, + "resuttype": "dataset", "pid": [ { "qualifier": {"classid": "doi"}, @@ -30,8 +30,7 @@ "refereed": { "classid": "0000", "classname": "UNKNOWN", - "schemeid": "dnet:review_levels", - "schemename": "dnet:review_levels" + "schemeid": "dnet:review_levels" }, "hostedby": { "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", @@ -40,45 +39,15 @@ "accessright": { "classid": "OPEN", "classname": "Open Access", - "schemeid": "dnet:access_modes", - "schemename": "dnet:access_modes" - }, - "processingchargecurrency": { - "dataInfo": { - "provenanceaction": { - "classid": "sysimport:crosswalk:datasetarchive", - "classname": "Harvested", - "schemeid": "dnet:provenanceActions", - "schemename": "dnet:provenanceActions" - }, - "deletedbyinference": false, - "inferred": false, - "inferenceprovenance": "", - "invisible": true, - "trust": "0.9" - }, - "value": "EUR" + "schemeid": "dnet:access_modes" }, + "processingchargecurrency": "EUR", "pid": [ { - "dataInfo": { - "provenanceaction": { - "classid": "sysimport:crosswalk:datasetarchive", - "classname": "Harvested", - "schemeid": "dnet:provenanceActions", - "schemename": "dnet:provenanceActions" - }, - "deletedbyinference": false, - "inferred": false, - "inferenceprovenance": "", - "invisible": true, - "trust": "0.9" - }, "qualifier": { "classid": "doi", "classname": "Digital Object Identifier", - "schemeid": "dnet:pid_types", - "schemename": "dnet:pid_types" + "schemeid": "dnet:pid_types" }, "value": "10.1371/journal.pone.0085605" } @@ -87,24 +56,10 @@ "url": 
["https://doi.org/10.1371/journal.pone.0085605"], "alternateIdentifier": [ { - "dataInfo": { - "provenanceaction": { - "classid": "sysimport:crosswalk:datasetarchive", - "classname": "Harvested", - "schemeid": "dnet:provenanceActions", - "schemename": "dnet:provenanceActions" - }, - "deletedbyinference": false, - "inferred": false, - "inferenceprovenance": "", - "invisible": true, - "trust": "0.9" - }, "qualifier": { "classid": "pmid", "classname": "PubMed ID", - "schemeid": "dnet:pid_types", - "schemename": "dnet:pid_types" + "schemeid": "dnet:pid_types" }, "value": "24454899.0" } @@ -113,27 +68,11 @@ "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value": "Repository B" }, - "processingchargeamount": { - "dataInfo": { - "provenanceaction": { - "classid": "sysimport:crosswalk:datasetarchive", - "classname": "Harvested", - "schemeid": "dnet:provenanceActions", - "schemename": "dnet:provenanceActions" - }, - "deletedbyinference": false, - "inferred": false, - "inferenceprovenance": "", - "invisible": true, - "trust": "0.9" - }, - "value": "1022.02" - }, + "processingchargeamount": "1022.02", "instancetype": { "classid": "0004", "classname": "Conference object", - "schemeid": "dnet:publication_resource", - "schemename": "dnet:publication_resource" + "schemeid": "dnet:publication_resource" } } ] diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_delegated.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_delegated.json index 967c1181b..d792dbcdd 100644 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_delegated.json +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/dataset_delegated.json @@ -1,6 +1,6 @@ { "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", - "resuttype": {"classid": "dataset"}, + "resuttype": "dataset", "pid": [ { "qualifier": {"classid": "doi"}, @@ -30,8 +30,7 @@ "refereed": { "classid": "0000", "classname": "UNKNOWN", - 
"schemeid": "dnet:review_levels", - "schemename": "dnet:review_levels" + "schemeid": "dnet:review_levels" }, "hostedby": { "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", @@ -40,45 +39,15 @@ "accessright": { "classid": "OPEN", "classname": "Open Access", - "schemeid": "dnet:access_modes", - "schemename": "dnet:access_modes" - }, - "processingchargecurrency": { - "dataInfo": { - "provenanceaction": { - "classid": "sysimport:crosswalk:datasetarchive", - "classname": "Harvested", - "schemeid": "dnet:provenanceActions", - "schemename": "dnet:provenanceActions" - }, - "deletedbyinference": false, - "inferred": false, - "inferenceprovenance": "", - "invisible": true, - "trust": "0.9" - }, - "value": "EUR" + "schemeid": "dnet:access_modes" }, + "processingchargecurrency": "EUR", "pid": [ { - "dataInfo": { - "provenanceaction": { - "classid": "sysimport:crosswalk:datasetarchive", - "classname": "Harvested", - "schemeid": "dnet:provenanceActions", - "schemename": "dnet:provenanceActions" - }, - "deletedbyinference": false, - "inferred": false, - "inferenceprovenance": "", - "invisible": true, - "trust": "0.9" - }, "qualifier": { "classid": "doi", "classname": "Digital Object Identifier", - "schemeid": "dnet:pid_types", - "schemename": "dnet:pid_types" + "schemeid": "dnet:pid_types" }, "value": "10.1371/journal.pone.0085605" } @@ -87,24 +56,10 @@ "url": ["https://doi.org/10.1371/journal.pone.0085605"], "alternateIdentifier": [ { - "dataInfo": { - "provenanceaction": { - "classid": "sysimport:crosswalk:datasetarchive", - "classname": "Harvested", - "schemeid": "dnet:provenanceActions", - "schemename": "dnet:provenanceActions" - }, - "deletedbyinference": false, - "inferred": false, - "inferenceprovenance": "", - "invisible": true, - "trust": "0.9" - }, "qualifier": { "classid": "pmid", "classname": "PubMed ID", - "schemeid": "dnet:pid_types", - "schemename": "dnet:pid_types" + "schemeid": "dnet:pid_types" }, "value": "24454899.0" } @@ -113,27 +68,11 @@ "key": 
"10|opendoar____::358aee4cc897452c00244351e4d91f69", "value": "Zenodo" }, - "processingchargeamount": { - "dataInfo": { - "provenanceaction": { - "classid": "sysimport:crosswalk:datasetarchive", - "classname": "Harvested", - "schemeid": "dnet:provenanceActions", - "schemename": "dnet:provenanceActions" - }, - "deletedbyinference": false, - "inferred": false, - "inferenceprovenance": "", - "invisible": true, - "trust": "0.9" - }, - "value": "1022.02" - }, + "processingchargeamount": "1022.02", "instancetype": { "classid": "0004", "classname": "Conference object", - "schemeid": "dnet:publication_resource", - "schemename": "dnet:publication_resource" + "schemeid": "dnet:publication_resource" } } ] diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/orp-rohub.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/orp-rohub.json new file mode 100644 index 000000000..c0f13ffbf --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/orp-rohub.json @@ -0,0 +1,197 @@ +{ + "collectedfrom": [ + { + "key": "10|fairsharing_::1b69ebedb522700034547abc5652ffac", + "value": "ROHub", + "dataInfo": null + } + ], + "dataInfo": { + "invisible": false, + "inferred": false, + "deletedbyinference": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:crosswalk:repository", + "classname": "sysimport:crosswalk:repository", + "schemeid": "dnet:provenanceActions" + } + }, + "lastupdatetimestamp": 1663926081966, + "id": "50|w3id________::afc7592914ae190a50570db90f55f9c2", + "originalId": [ + "50|fsh_____4119::afc7592914ae190a50570db90f55f9c2", + "https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca" + ], + "pid": [ + { + "value": "https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", + "qualifier": { + "classid": "w3id", + "classname": "w3id.org", + "schemeid": "dnet:pid_types" + } + } + ], + "dateofcollection": "2019-03-27T15:15:22.22Z", + "dateoftransformation": 
"2019-04-17T16:04:20.586Z", + "extraInfo": [], + "oaiprovenance": null, + "processingchargeamount": null, + "processingchargecurrency": null, + "measures": null, + "author": [ + { + "fullname": "CNR-ISMAR", + "name": "", + "surname": "", + "rank": 1, + "pid": [] + } + ], + "resulttype": "other", + "language": { + "classid": "UNKNOWN", + "classname": "Unknown", + "schemeid": "dnet:languages" + }, + "country": [], + "subject": [ + { + "value": "Ecology", + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "" + }, + "dataInfo": { + "inferred": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:crosswalk:repository", + "classname": "sysimport:crosswalk:repository", + "schemeid": "dnet:provenanceActions" + } + } + }, + { + "value": "EOSC::RO-crate", + "qualifier": { + "classid": "", + "classname": "", + "schemeid": "" + }, + "dataInfo": { + "inferred": false, + "trust": "0.9", + "inferenceprovenance": null, + "provenanceaction": { + "classid": "sysimport:crosswalk:repository", + "classname": "sysimport:crosswalk:repository", + "schemeid": "dnet:provenanceActions" + } + } + } + ], + "title": [ + { + "value": "Using biological effects tools to define Good Environmental Status under the European Union Marine Strategy Framework Directive", + "qualifier": { + "classid": "main title", + "classname": "main title", + "schemeid": "dnet:dataCite_title" + } + } + ], + "relevantdate": [ + { + "value": "2018-06-20T11:21:46Z", + "qualifier": { + "classid": "UNKNOWN", + "classname": "UNKNOWN", + "schemeid": "dnet:dataCite_date" + } + } + ], + "description": [ + "The use of biological effects tools offer enormous potential to meet the challenges outlined by the European Union Marine Strategy Framework Directive (MSFD) whereby Member States are required to develop a robust set of tools for defining 11 qualitative descriptors of Good Environmental Status (GES), such as demonstrating that \"Concentrations of contaminants 
are at levels not giving rise to pollution effects\" (GES Descriptor 8). This paper discusses the combined approach of monitoring chemical contaminant levels, along side biological effect measurements relating to the effect of pollutants, for undertaking assessments of GES across European marine regions. We outline the minimum standards that biological effects tools should meet if they are to be used for defining GES in relation to Descriptor 8 and describe the current international initiatives underway to develop assessment criteria for these biological effects techniques. Crown Copyright (C) 2010 Published by Elsevier Ltd. All rights reserved." + ], + "dateofacceptance": null, + "publisher": { + "name": "PoznaƄ Supercomputing and Networking Center" + }, + "embargoenddate": null, + "source": [], + "fulltext": [], + "format": [], + "contributor": [ + "Generation Service" + ], + "resourcetype": { + "classid": "RO-crate", + "classname": "RO-crate", + "schemeid": "dnet:dataCite_resource", + "schemename": "dnet:dataCite_resource" + }, + "coverage": [], + "bestaccessright": { + "classid": "OPEN", + "classname": "Open Access", + "schemeid": "dnet:access_modes", + "schemename": "dnet:access_modes" + }, + "context": [], + "externalReference": [], + "instance": [ + { + "license": null, + "accessright": { + "classid": "OPEN", + "classname": "Open Access", + "schemeid": "dnet:access_modes", + "openAccessRoute": null + }, + "instancetype": { + "classid": "other research product", + "classname": "other research product", + "schemeid": "dnet:publication_resource" + }, + "hostedby": { + "key": "10|fairsharing_::1b69ebedb522700034547abc5652ffac", + "value": "ROHub" + }, + "url": null, + "distributionlocation": null, + "collectedfrom": { + "key": "10|fairsharing_::1b69ebedb522700034547abc5652ffac", + "value": "ROHub" + }, + "pid": [ + { + "value": "https://w3id.org/ro-id/0ab171a7-45c5-4194-82d4-850955504bca", + "qualifier": { + "classid": "w3id", + "classname": "w3id.org", + 
"schemeid": "dnet:pid_types" + } + } + ], + "alternateIdentifier": [], + "dateofacceptance": null, + "processingchargeamount": null, + "processingchargecurrency": null, + "refereed": { + "classid": "UNKNOWN", + "classname": "Unknown", + "schemeid": "dnet:review_levels" + }, + "measures": null + } + ], + "eoscifguidelines": null, + "contactperson": [], + "contactgroup": [], + "tool": [] +} diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json index 704c5ad4d..dcc893093 100644 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_1.json @@ -1 +1,28 @@ -{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]} \ No newline at end of file +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "resuttype": "publication", + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2011.03.013" + }, + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ], + "collectedfrom": [ + { + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", + "value": "Crossref" + } + ] +} \ No newline at end of file diff --git 
a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json index a1744e84e..b6aee7045 100644 --- a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_2.json @@ -1 +1,28 @@ -{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]} \ No newline at end of file +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "resuttype": "publication", + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2011.03.013" + }, + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ], + "collectedfrom": [ + { + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", + "value": "Repository A" + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_3.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_3.json new file mode 100644 index 000000000..6d33568f4 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_3.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]} \ No newline 
at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_4.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_4.json new file mode 100644 index 000000000..6617fe15f --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_4.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_5.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_5.json new file mode 100644 index 000000000..700a10046 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_5.json @@ -0,0 +1 @@ +{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi1.json new file mode 100644 index 000000000..83bc0cd20 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi1.json @@ -0,0 +1,33 @@ +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "instance": [ + { + "collectedfrom": { + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", + "value": "Crossref" + }, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2010.03.013" + } + ] + }, + { + "pid": [ + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ] + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi2.json 
b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi2.json new file mode 100644 index 000000000..5c73fc3c7 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi2.json @@ -0,0 +1,37 @@ +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "instance": [ + { + "collectedfrom": { + "key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", + "value": "Crossref" + }, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2010.03.013" + } + ] + }, + { + "collectedfrom": { + "key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", + "value": "Europe PubMed Central" + }, + "pid": [ + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ] + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi3.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi3.json new file mode 100644 index 000000000..b1ea01f60 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi3.json @@ -0,0 +1,37 @@ +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "instance": [ + { + "collectedfrom": { + "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", + "value": "Zenodo" + }, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2010.03.013" + } + ] + }, + { + "collectedfrom": { + "key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", + "value": "Europe PubMed Central" + }, + "pid": [ + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + 
"value": "21459329" + } + ] + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi4.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi4.json new file mode 100644 index 000000000..764c510a8 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi4.json @@ -0,0 +1,37 @@ +{ + "id": "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", + "instance": [ + { + "collectedfrom": { + "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", + "value": "Zenodo" + }, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2010.03.013" + }, + { + "qualifier": {"classid": "handle"}, + "value": "11012/83840" + } + ] + }, + { + "collectedfrom": { + "key": "10|opendoar____::2852", + "value": "Digital library of Brno University of Technology" + }, + "pid": [ + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + }, + { + "qualifier": {"classid": "handle"}, + "value": "11012/83840" + } + ] + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi5.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi5.json new file mode 100644 index 000000000..816f0dcb6 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_doi5.json @@ -0,0 +1,37 @@ +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "instance": [ + { + "collectedfrom": { + "key": "10|opendoar____::358aee4cc897452c00244351e4d91f69", + "value": "Zenodo" + }, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.5281/zenodo.5121485" + } + ] + }, + { + "collectedfrom": { + "key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", + "value": "Europe PubMed Central" + }, + "pid": [ + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { 
+ "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ] + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_openapc.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_openapc.json new file mode 100644 index 000000000..f06ac1822 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_openapc.json @@ -0,0 +1,31 @@ +{ + "id": "50|openapc_____::000023f9cb6e3a247c764daec4273cbc", + "resuttype": { + "classid": "publication" + }, + "instance": [ + { + "collectedfrom": { + "key": "10|apc_________::e2b1600b229fc30663c8a1f662debddf", + "value": "OpenAPC Global Initiative" + }, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2010.03.013" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + }, + { + "qualifier": {"classid": "pmid"}, + "value": "25811027" + } + ], + "url":["https://doi.org/10.1155/2015/439379"] + } + ] +} + + diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc1.json new file mode 100644 index 000000000..537719fc4 --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc1.json @@ -0,0 +1,17 @@ +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "pid": [ + { + "qualifier": {"classid": "urn"}, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": {"classid": "scp-number"}, + "value": "79953761260" + }, + { + "qualifier": {"classid": "pmc"}, + "value": "21459329" + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc2.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc2.json new 
file mode 100644 index 000000000..e7d49eebb --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_pmc2.json @@ -0,0 +1,21 @@ +{ + "id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "instance": [ + { + "collectedfrom": { + "key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c", + "value": "Europe PubMed Central" + }, + "pid": [ + { + "qualifier": {"classid": "doi"}, + "value": "10.1016/j.cmet.2010.03.013" + }, + { + "qualifier":{"classid":"pmc"}, + "value":"21459329" + } + ] + } + ] +} \ No newline at end of file diff --git a/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn1.json b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn1.json new file mode 100644 index 000000000..5323ac8bd --- /dev/null +++ b/dhp-common/src/test/resources/eu/dnetlib/dhp/schema/oaf/utils/publication_urn1.json @@ -0,0 +1,23 @@ +{ + "id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", + "pid": [ + { + "qualifier": { + "classid": "urn" + }, + "value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2" + }, + { + "qualifier": { + "classid": "scp-number" + }, + "value": "79953761260" + }, + { + "qualifier": { + "classid": "pmcid" + }, + "value": "21459329" + } + ] +} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala index 85f5a3082..5da302c54 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/collection/CollectionUtils.scala @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.collection import com.fasterxml.jackson.databind.ObjectMapper import eu.dnetlib.dhp.schema.common.ModelSupport -import eu.dnetlib.dhp.schema.oaf.{Oaf, OafEntity, Relation} +import eu.dnetlib.dhp.schema.oaf.{Entity, Oaf, 
Entity, Relation} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode} object CollectionUtils { @@ -17,7 +17,7 @@ object CollectionUtils { */ def fixRelations(i: Oaf): List[Oaf] = { - if (i.isInstanceOf[OafEntity]) + if (i.isInstanceOf[Entity]) return List(i) else { val r: Relation = i.asInstanceOf[Relation] @@ -34,10 +34,9 @@ object CollectionUtils { inverse.setRelType(currentRel.getRelType) inverse.setSubRelType(currentRel.getSubReltype) inverse.setRelClass(currentRel.getInverseRelClass) - inverse.setCollectedfrom(r.getCollectedfrom) + inverse.setProvenance(r.getProvenance) inverse.setDataInfo(r.getDataInfo) inverse.setProperties(r.getProperties) - inverse.setLastupdatetimestamp(r.getLastupdatetimestamp) inverse.setValidated(r.getValidated) inverse.setValidationDate(r.getValidationDate) return List(r, inverse) diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala index a59779387..e577d16a0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteModelConstants.scala @@ -2,7 +2,7 @@ package eu.dnetlib.dhp.datacite import eu.dnetlib.dhp.schema.common.ModelConstants import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils -import eu.dnetlib.dhp.schema.oaf.{DataInfo, KeyValue} +import eu.dnetlib.dhp.schema.oaf.{DataInfo, EntityDataInfo, KeyValue} import java.io.InputStream import java.time.format.DateTimeFormatter @@ -72,7 +72,7 @@ object DataciteModelConstants { val DOI_CLASS = "doi" val SUBJ_CLASS = "keywords" val DATACITE_NAME = "Datacite" - val dataInfo: DataInfo = dataciteDataInfo("0.9") + val dataInfo: EntityDataInfo = dataciteDataInfo(0.9f) val DATACITE_COLLECTED_FROM: KeyValue = OafMapperUtils.keyValue(ModelConstants.DATACITE_ID, DATACITE_NAME) @@ -221,13 
+221,13 @@ object DataciteModelConstants { Source.fromInputStream(stream).getLines().toList } - def dataciteDataInfo(trust: String): DataInfo = OafMapperUtils.dataInfo( + def dataciteDataInfo(trust: Float): EntityDataInfo = OafMapperUtils.dataInfo( false, + false, + trust, null, false, - false, - ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER, - trust + ModelConstants.PROVENANCE_ACTION_SET_QUALIFIER ) val df_en: DateTimeFormatter = DateTimeFormatter.ofPattern( diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala index a7ad9e2d6..e3bbcb9e0 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/datacite/DataciteToOAFTransformation.scala @@ -1,6 +1,7 @@ package eu.dnetlib.dhp.datacite import com.fasterxml.jackson.databind.ObjectMapper +import com.google.common.collect.Lists import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup import eu.dnetlib.dhp.datacite.DataciteModelConstants._ import eu.dnetlib.dhp.schema.action.AtomicAction @@ -284,27 +285,24 @@ object DataciteToOAFTransformation { } def generateOAFDate(dt: String, q: Qualifier): StructuredProperty = { - OafMapperUtils.structuredProperty(dt, q, null) + OafMapperUtils.structuredProperty(dt, q) } def generateRelation( - sourceId: String, - targetId: String, - relClass: String, - cf: KeyValue, - di: DataInfo + sourceId: String, + targetId: String, + relClass: String, + collectedFrom: KeyValue, + di: DataInfo ): Relation = { - val r = new Relation r.setSource(sourceId) r.setTarget(targetId) r.setRelType(ModelConstants.RESULT_PROJECT) r.setRelClass(relClass) r.setSubRelType(ModelConstants.OUTCOME) - r.setCollectedfrom(List(cf).asJava) - r.setDataInfo(di) + 
r.setProvenance(Lists.newArrayList(OafMapperUtils.getProvenance(collectedFrom, di))) r - } def get_projectRelation(awardUri: String, sourceId: String): List[Relation] = { @@ -353,10 +351,9 @@ object DataciteToOAFTransformation { val doi_q = OafMapperUtils.qualifier( "doi", "doi", - ModelConstants.DNET_PID_TYPES, ModelConstants.DNET_PID_TYPES ) - val pid = OafMapperUtils.structuredProperty(doi, doi_q, dataInfo) + val pid = OafMapperUtils.structuredProperty(doi, doi_q) result.setPid(List(pid).asJava) // This identifiere will be replaced in a second moment using the PID logic generation @@ -389,7 +386,7 @@ object DataciteToOAFTransformation { ) else null if (ni.nameIdentifier != null && ni.nameIdentifier.isDefined) { - OafMapperUtils.structuredProperty(ni.nameIdentifier.get, q, dataInfo) + OafMapperUtils.authorPid(ni.nameIdentifier.get, q, dataInfo) } else null @@ -397,13 +394,6 @@ object DataciteToOAFTransformation { .asJava ) } - if (c.affiliation.isDefined) - a.setAffiliation( - c.affiliation.get - .filter(af => af.nonEmpty) - .map(af => OafMapperUtils.field(af, dataInfo)) - .asJava - ) a.setRank(idx + 1) a } @@ -420,15 +410,13 @@ object DataciteToOAFTransformation { .map(t => { if (t.titleType.isEmpty) { OafMapperUtils - .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER, null) + .structuredProperty(t.title.get, ModelConstants.MAIN_TITLE_QUALIFIER) } else { OafMapperUtils.structuredProperty( t.title.get, t.titleType.get, t.titleType.get, - ModelConstants.DNET_DATACITE_TITLE, - ModelConstants.DNET_DATACITE_TITLE, - null + ModelConstants.DNET_DATACITE_TITLE ) } }) @@ -449,46 +437,40 @@ object DataciteToOAFTransformation { .map(d => d.get) if (a_date.isDefined) { - if (doi.startsWith("10.14457")) - result.setEmbargoenddate( - OafMapperUtils.field(fix_thai_date(a_date.get, "[yyyy-MM-dd]"), null) - ) - else - result.setEmbargoenddate(OafMapperUtils.field(a_date.get, null)) + if (doi.startsWith("10.14457")) { + val date = fix_thai_date(a_date.get, 
"[yyyy-MM-dd]") + result.setEmbargoenddate(date) + } else { + result.setEmbargoenddate(a_date.get) + } } if (i_date.isDefined && i_date.get.isDefined) { if (doi.startsWith("10.14457")) { - result.setDateofacceptance( - OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null) - ) + val date = fix_thai_date(i_date.get.get, "[yyyy-MM-dd]") + result.setDateofacceptance(date) result .getInstance() .get(0) - .setDateofacceptance( - OafMapperUtils.field(fix_thai_date(i_date.get.get, "[yyyy-MM-dd]"), null) - ) + .setDateofacceptance(date) } else { - result.setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) - result.getInstance().get(0).setDateofacceptance(OafMapperUtils.field(i_date.get.get, null)) + result.setDateofacceptance(i_date.get.get) + result.getInstance().get(0).setDateofacceptance(i_date.get.get) } } else if (publication_year != null) { + val date = s"01-01-$publication_year" if (doi.startsWith("10.14457")) { - result.setDateofacceptance( - OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null) - ) + val date = fix_thai_date(date, "[dd-MM-yyyy]") + result.setDateofacceptance(date) result .getInstance() .get(0) - .setDateofacceptance( - OafMapperUtils.field(fix_thai_date(s"01-01-$publication_year", "[dd-MM-yyyy]"), null) - ) - + .setDateofacceptance(date) } else { - result.setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + result.setDateofacceptance(date) result .getInstance() .get(0) - .setDateofacceptance(OafMapperUtils.field(s"01-01-$publication_year", null)) + .setDateofacceptance(date) } } @@ -519,8 +501,7 @@ object DataciteToOAFTransformation { SUBJ_CLASS, SUBJ_CLASS, ModelConstants.DNET_SUBJECT_TYPOLOGIES, - ModelConstants.DNET_SUBJECT_TYPOLOGIES, - null + dataInfo ) ) .asJava @@ -533,14 +514,14 @@ object DataciteToOAFTransformation { result.setDescription( descriptions .filter(d => d.description.isDefined) - .map(d => OafMapperUtils.field(d.description.get, null)) + 
.map(d => d.description.get) .filter(s => s != null) .asJava ) val publisher = (json \\ "publisher").extractOrElse[String](null) if (publisher != null) - result.setPublisher(OafMapperUtils.field(publisher, null)) + result.setPublisher(OafMapperUtils.publisher(publisher)) val language: String = (json \\ "language").extractOrElse[String](null) @@ -568,7 +549,6 @@ object DataciteToOAFTransformation { a.setClassid(q.getClassid) a.setClassname(q.getClassname) a.setSchemeid(q.getSchemeid) - a.setSchemename(q.getSchemename) a }) @@ -598,7 +578,7 @@ object DataciteToOAFTransformation { ) ) if (license.isDefined) - instance.setLicense(OafMapperUtils.field(license.get, null)) + instance.setLicense(OafMapperUtils.license(license.get)) } val awardUris: List[String] = for { @@ -654,7 +634,8 @@ object DataciteToOAFTransformation { ) .map(r => { val rel = new Relation - rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) + + rel.setProvenance(Lists.newArrayList(OafMapperUtils.getProvenance(DATACITE_COLLECTED_FROM, dataInfo))) rel.setDataInfo(dataInfo) val subRelType = subRelTypeMapping(r.relationType).relType @@ -670,8 +651,7 @@ object DataciteToOAFTransformation { rel.setTarget( DHPUtils.generateUnresolvedIdentifier(r.relatedIdentifier, r.relatedIdentifierType) ) - rel.setCollectedfrom(List(DATACITE_COLLECTED_FROM).asJava) - rel.getCollectedfrom.asScala.map(c => c.getValue).toList + rel }) } diff --git a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala index 87116f00a..8ac8b00bf 100644 --- a/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala +++ b/dhp-workflows/dhp-aggregation/src/main/scala/eu/dnetlib/dhp/sx/bio/ebi/SparkCreateBaselineDataFrame.scala @@ -27,7 +27,8 @@ object SparkCreateBaselineDataFrame { def requestBaseLineUpdatePage(maxFile: String): 
List[(String, String)] = { val data = requestPage("https://ftp.ncbi.nlm.nih.gov/pubmed/updatefiles/") - val result = data.linesWithSeparators.map(l =>l.stripLineEnd) + val result = data.linesWithSeparators + .map(l => l.stripLineEnd) .filter(l => l.startsWith("") diff --git a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala index 24caaa553..d1611300d 100644 --- a/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala +++ b/dhp-workflows/dhp-aggregation/src/test/scala/eu/dnetlib/dhp/sx/bio/BioScholixTest.scala @@ -63,7 +63,9 @@ class BioScholixTest extends AbstractVocabularyTest { val records: String = Source .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pubmed_dump")) .mkString - val r: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList + val r: List[Oaf] = records.linesWithSeparators + .map(l => l.stripLineEnd) + .toList .map(s => mapper.readValue(s, classOf[PMArticle])) .map(a => PubMedToOaf.convert(a, vocabularies)) assertEquals(10, r.size) @@ -173,9 +175,10 @@ class BioScholixTest extends AbstractVocabularyTest { val records: String = Source .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/pdb_dump")) .mkString - records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) + records.linesWithSeparators.map(l => l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) - val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList.flatMap(o => BioDBToOAF.pdbTOOaf(o)) + val result: List[Oaf] = + records.linesWithSeparators.map(l => l.stripLineEnd).toList.flatMap(o => BioDBToOAF.pdbTOOaf(o)) assertTrue(result.nonEmpty) result.foreach(r => assertNotNull(r)) @@ -194,9 +197,10 @@ class BioScholixTest extends AbstractVocabularyTest { val records: String = Source 
.fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/uniprot_dump")) .mkString - records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) + records.linesWithSeparators.map(l => l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) - val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).toList.flatMap(o => BioDBToOAF.uniprotToOAF(o)) + val result: List[Oaf] = + records.linesWithSeparators.map(l => l.stripLineEnd).toList.flatMap(o => BioDBToOAF.uniprotToOAF(o)) assertTrue(result.nonEmpty) result.foreach(r => assertNotNull(r)) @@ -239,9 +243,10 @@ class BioScholixTest extends AbstractVocabularyTest { val records: String = Source .fromInputStream(getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/crossref_links")) .mkString - records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) + records.linesWithSeparators.map(l => l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) - val result: List[Oaf] = records.linesWithSeparators.map(l =>l.stripLineEnd).map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList + val result: List[Oaf] = + records.linesWithSeparators.map(l => l.stripLineEnd).map(s => BioDBToOAF.crossrefLinksToOaf(s)).toList assertNotNull(result) assertTrue(result.nonEmpty) @@ -276,14 +281,17 @@ class BioScholixTest extends AbstractVocabularyTest { getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/bio/scholix_resolved") ) .mkString - records.linesWithSeparators.map(l =>l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) + records.linesWithSeparators.map(l => l.stripLineEnd).foreach(s => assertTrue(s.nonEmpty)) implicit lazy val formats: DefaultFormats.type = org.json4s.DefaultFormats - val l: List[ScholixResolved] = records.linesWithSeparators.map(l =>l.stripLineEnd).map { input => - lazy val json = parse(input) - json.extract[ScholixResolved] - }.toList + val l: List[ScholixResolved] = records.linesWithSeparators + .map(l => l.stripLineEnd) + .map 
{ input => + lazy val json = parse(input) + json.extract[ScholixResolved] + } + .toList val result: List[Oaf] = l.map(s => BioDBToOAF.scholixResolvedToOAF(s)) diff --git a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java index 6989ec54b..7a3b51bc0 100644 --- a/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java +++ b/dhp-workflows/dhp-dedup-openaire/src/main/java/eu/dnetlib/dhp/oa/dedup/SparkCreateDedupRecord.java @@ -6,6 +6,9 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVENANCE_DEDUP; import java.io.IOException; +import eu.dnetlib.dhp.schema.oaf.Entity; +import eu.dnetlib.dhp.schema.oaf.common.EntityType; +import eu.dnetlib.dhp.schema.oaf.common.ModelSupport; import org.apache.commons.io.IOUtils; import org.apache.spark.SparkConf; import org.apache.spark.sql.SaveMode; @@ -77,7 +80,7 @@ public class SparkCreateDedupRecord extends AbstractSparkAction { final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, subEntity); final String entityPath = DedupUtility.createEntityPath(graphBasePath, subEntity); - final Class clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity)); + final Class clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity)); final DataInfo dataInfo = getDataInfo(dedupConf); DedupRecordFactory .createDedupRecord(spark, dataInfo, mergeRelPath, entityPath, clazz) diff --git a/pom.xml b/pom.xml index 9b60b9078..42195ddfd 100644 --- a/pom.xml +++ b/pom.xml @@ -807,7 +807,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [3.15.0] + [4.0.0-SNAPSHOT] [4.0.3] [6.0.5] [3.1.6]