From ebf53a1616b256de90aa62255d5c4e2b13d34237 Mon Sep 17 00:00:00 2001 From: Claudio Atzori Date: Wed, 15 Sep 2021 16:10:37 +0200 Subject: [PATCH] added cleaning for relation fields: subRelType & relClass according to dedicated vocabs --- .../dhp/oa/graph/clean/CleaningRuleMap.java | 31 ++++++++--- .../clean/GraphCleaningFunctionsTest.java | 55 ++++++++++++------- .../dnetlib/dhp/oa/graph/clean/relation.json | 10 ++++ .../dnetlib/dhp/oa/graph/clean/synonyms.txt | 12 +++- .../eu/dnetlib/dhp/oa/graph/clean/terms.txt | 39 ++++++++++++- pom.xml | 2 +- 6 files changed, 117 insertions(+), 32 deletions(-) create mode 100644 dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/relation.json diff --git a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java index 95aa749b2..7a3583289 100644 --- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java +++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/clean/CleaningRuleMap.java @@ -12,6 +12,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.AccessRight; import eu.dnetlib.dhp.schema.oaf.Country; import eu.dnetlib.dhp.schema.oaf.Qualifier; +import eu.dnetlib.dhp.schema.oaf.Relation; public class CleaningRuleMap extends HashMap, SerializableConsumer> implements Serializable { @@ -24,17 +25,31 @@ public class CleaningRuleMap extends HashMap, SerializableConsumer cleanQualifier(vocabularies, (Qualifier) o)); mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o)); - mapping.put(Country.class, o -> { - final Country c = (Country) o; - if (StringUtils.isBlank(c.getSchemeid())) { - c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE); - c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE); - } - cleanQualifier(vocabularies, c); - }); + mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o)); + mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o)); return mapping; } + private static void cleanRelation(VocabularyGroup vocabularies, Relation r) { + if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) { + Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType()); + r.setSubRelType(newValue.getClassid()); + } + if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_RELCLASS)) { + Qualifier newValue = vocabularies.lookup(ModelConstants.DNET_RELATION_RELCLASS, r.getRelClass()); + r.setRelClass(newValue.getClassid()); + } + } + + private static void cleanCountry(VocabularyGroup vocabularies, Country o) { + final Country c = o; + if (StringUtils.isBlank(c.getSchemeid())) { + c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE); + c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE); + } + cleanQualifier(vocabularies, c); + } + private static void cleanQualifier(VocabularyGroup vocabularies, Q q) { if (vocabularies.vocabularyExists(q.getSchemeid())) { Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid()); diff --git a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java index edcd72ab4..42d9f226c 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java +++ b/dhp-workflows/dhp-graph-mapper/src/test/java/eu/dnetlib/dhp/oa/graph/clean/GraphCleaningFunctionsTest.java @@ -5,6 +5,7 @@ import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.lenient; import java.io.IOException; +import java.util.Collection; import java.util.List; import java.util.Set; import java.util.stream.Stream; @@ -16,12 +17,12 @@ import org.junit.jupiter.api.extension.ExtendWith; import org.mockito.Mock; import org.mockito.junit.jupiter.MockitoExtension; +import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup; import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.oaf.*; -import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions; import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @@ -29,7 +30,8 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; @ExtendWith(MockitoExtension.class) public class GraphCleaningFunctionsTest { - public static final ObjectMapper MAPPER = new ObjectMapper(); + public static final ObjectMapper MAPPER = new ObjectMapper() + .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); @Mock private ISLookUpService isLookUpService; @@ -49,6 +51,23 @@ public class GraphCleaningFunctionsTest { mapping = CleaningRuleMap.create(vocabularies); } + @Test + void testCleanRelations() throws Exception { + + List lines = IOUtils + .readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/relation.json")); + for (String json : lines) { + Relation r_in = MAPPER.readValue(json, Relation.class); + assertNotNull(r_in); + + assertFalse(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_in.getRelClass())); + + Relation r_out = OafCleaner.apply(r_in, mapping); + assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_out.getRelClass())); + assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r_out.getSubRelType())); + } + } + @Test void testCleaning() throws Exception { @@ -87,7 +106,7 @@ public class GraphCleaningFunctionsTest { p_out .getPid() .stream() - .map(p -> p.getQualifier()) + .map(StructuredProperty::getQualifier) .allMatch(q -> pidTerms.contains(q.getClassid()))); List poi = p_out.getInstance(); @@ -101,8 +120,8 @@ public class GraphCleaningFunctionsTest { assertEquals(2, poii.getPid().size()); assertTrue( - poii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent()); - assertTrue(poii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent()); + poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x"))); + assertTrue(poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd"))); assertNotNull(poii.getAlternateIdentifier()); assertEquals(2, poii.getAlternateIdentifier().size()); @@ -111,16 +130,12 @@ public class GraphCleaningFunctionsTest { poii .getAlternateIdentifier() .stream() - .filter(s -> s.getValue().equals("10.1007/s109090161569x")) - .findFirst() - .isPresent()); + .anyMatch(s -> s.getValue().equals("10.1007/s109090161569x"))); assertTrue( poii .getAlternateIdentifier() .stream() - .filter(s -> s.getValue().equals("10.1009/qwerty")) - .findFirst() - .isPresent()); + .anyMatch(s -> s.getValue().equals("10.1009/qwerty"))); Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out); @@ -142,8 +157,8 @@ public class GraphCleaningFunctionsTest { assertEquals(2, pcii.getPid().size()); assertTrue( - pcii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent()); - assertTrue(pcii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent()); + pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x"))); + assertTrue(pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd"))); assertNotNull(pcii.getAlternateIdentifier()); assertEquals(1, pcii.getAlternateIdentifier().size()); @@ -151,9 +166,7 @@ public class GraphCleaningFunctionsTest { pcii .getAlternateIdentifier() .stream() - .filter(s -> s.getValue().equals("10.1009/qwerty")) - .findFirst() - .isPresent()); + .anyMatch(s -> s.getValue().equals("10.1009/qwerty"))); getAuthorPids(p_cleaned).forEach(pid -> { System.out @@ -172,17 +185,17 @@ public class GraphCleaningFunctionsTest { return pub .getAuthor() .stream() - .map(a -> a.getPid()) - .flatMap(p -> p.stream()) - .map(s -> s.getQualifier()); + .map(Author::getPid) + .flatMap(Collection::stream) + .map(StructuredProperty::getQualifier); } private Stream getAuthorPids(Result pub) { return pub .getAuthor() .stream() - .map(a -> a.getPid()) - .flatMap(p -> p.stream()); + .map(Author::getPid) + .flatMap(Collection::stream); } private List vocs() throws IOException { diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/relation.json b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/relation.json new file mode 100644 index 000000000..97764de00 --- /dev/null +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/relation.json @@ -0,0 +1,10 @@ +{"relType":"resultResult","subRelType":"citation","relClass":"cites","source":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","target":"50|openaire____::007a4870b31056f89b768cf508e1538e"} +{"relType":"resultResult","subRelType":"citation","relClass":"isCitedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} +{"relType":"resultResult","subRelType":"supplement","relClass":"isSupplementTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} +{"relType":"resultResult","subRelType":"supplement","relClass":"isSupplementedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} +{"relType":"resultResult","subRelType":"part","relClass":"isPartOf","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} +{"relType":"resultResult","subRelType":"part","relClass":"hasPart","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} +{"relType":"resultResult","subRelType":"review","relClass":"isReviewedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} +{"relType":"resultResult","subRelType":"review","relClass":"reviews","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} +{"relType":"resultResult","subRelType":"relationship","relClass":"isRelatedTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} +{"relType":"resultResult","subRelType":"publicationDataset","relClass":"isRelatedTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"} \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt index 729296522..79dc7cd2d 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt @@ -1231,4 +1231,14 @@ dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-学術雑誌論文(査 dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-紀要論文(査読有り) dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-雑誌記事(査読有り) dnet:review_levels @=@ 0001 @=@ 原著論文(査読有り) -dnet:review_levels @=@ 0001 @=@ 査読論文 \ No newline at end of file +dnet:review_levels @=@ 0001 @=@ 査読論文 +dnet:relation_relClass @=@ Cites @=@ cites +dnet:relation_relClass @=@ IsCitedBy @=@ isCitedBy +dnet:relation_relClass @=@ HasPart @=@ hasPart +dnet:relation_relClass @=@ IsPartOf @=@ isPartOf +dnet:relation_relClass @=@ IsReviewedBy @=@ isReviewedBy +dnet:relation_relClass @=@ Reviews @=@ reviews +dnet:relation_relClass @=@ IsSupplementTo @=@ isSupplementTo +dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy +dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo +dnet:relation_subRelType @=@ relationship @=@ publicationDataset \ No newline at end of file diff --git a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt index ba47aaf5c..bb1e5fbf9 100644 --- a/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt +++ b/dhp-workflows/dhp-graph-mapper/src/test/resources/eu/dnetlib/dhp/oa/graph/clean/terms.txt @@ -1079,4 +1079,41 @@ dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/AUTHOR/ORCID @=@ An Open Researcher and Contributor ID (ORCID) that can be associated to an author of your publications dnet:review_levels @=@ dnet:review_levels @=@ 0000 @=@ Unknown dnet:review_levels @=@ dnet:review_levels @=@ 0002 @=@ nonPeerReviewed -dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed \ No newline at end of file +dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed +dnet:relation_relClass @=@ dnet:relation_relClass @=@ Cites @=@ Cites +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsCitedBy @=@ IsCitedBy +dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasPart @=@ HasPart +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsPartOf @=@ IsPartOf +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsReviewedBy @=@ IsReviewedBy +dnet:relation_relClass @=@ dnet:relation_relClass @=@ Reviews @=@ Reviews +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSupplementTo @=@ IsSupplementTo +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSupplementedBy @=@ IsSupplementedBy +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsRelatedTo @=@ IsRelatedTo +dnet:relation_relClass @=@ dnet:relation_relClass @=@ Compiles @=@ Compiles +dnet:relation_relClass @=@ dnet:relation_relClass @=@ Continues @=@ Continues +dnet:relation_relClass @=@ dnet:relation_relClass @=@ Documents @=@ Documents +dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasAmongTopNSimilarDocuments @=@ HasAmongTopNSimilarDocuments +dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasVersion @=@ HasVersion +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsAmongTopNSimilarDocuments @=@ IsAmongTopNSimilarDocuments +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsCompiledBy @=@ IsCompiledBy +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsContinuedBy @=@ IsContinuedBy +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsDerivedFrom @=@ IsDerivedFrom +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsDocumentedBy @=@ IsDocumentedBy +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsNewVersionOf @=@ IsNewVersionOf +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsObsoletedBy @=@ IsObsoletedBy +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsOriginalFormOf @=@ IsOriginalFormOf +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsPreviousVersionOf @=@ IsPreviousVersionOf +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSourceOf @=@ IsSourceOf +dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsVariantFormOf @=@ IsVariantFormOf +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ affiliation @=@ affiliation +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ citation @=@ citation +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ dedup @=@ dedup +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ outcome @=@ outcome +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ part @=@ part +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ participation @=@ participation +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ provision @=@ provision +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ relationship @=@ relationship +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement +dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version \ No newline at end of file diff --git a/pom.xml b/pom.xml index 99525ef85..61b0ad873 100644 --- a/pom.xml +++ b/pom.xml @@ -753,7 +753,7 @@ 3.3.3 3.4.2 [2.12,3.0) - [2.7.17] + [2.7.18] [4.0.3] [6.0.5] [3.1.6]