implementation of the whitelist for similarity relations #144

Merged
claudio.atzori merged 3 commits from dedup_whitelist into beta 2021-09-27 16:47:41 +02:00
6 changed files with 117 additions and 32 deletions
Showing only changes of commit 7fa60e166e - Show all commits

View File

@ -12,6 +12,7 @@ import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.AccessRight;
import eu.dnetlib.dhp.schema.oaf.Country;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Object>> implements Serializable {
@ -24,17 +25,31 @@ public class CleaningRuleMap extends HashMap<Class<?>, SerializableConsumer<Obje
CleaningRuleMap mapping = new CleaningRuleMap();
mapping.put(Qualifier.class, o -> cleanQualifier(vocabularies, (Qualifier) o));
mapping.put(AccessRight.class, o -> cleanQualifier(vocabularies, (AccessRight) o));
mapping.put(Country.class, o -> {
final Country c = (Country) o;
if (StringUtils.isBlank(c.getSchemeid())) {
c.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
c.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
}
cleanQualifier(vocabularies, c);
});
mapping.put(Country.class, o -> cleanCountry(vocabularies, (Country) o));
mapping.put(Relation.class, o -> cleanRelation(vocabularies, (Relation) o));
return mapping;
}
/**
 * Normalises the {@code subRelType} and {@code relClass} of a relation against the vocabularies.
 * <p>
 * For each of the two fields, when the corresponding vocabulary
 * ({@link ModelConstants#DNET_RELATION_SUBRELTYPE}, {@link ModelConstants#DNET_RELATION_RELCLASS})
 * exists in the group, the current value is passed to {@code vocabularies.lookup} and replaced by the
 * {@code classid} of the returned qualifier. A field whose vocabulary is absent is left untouched.
 * NOTE(review): the lookup presumably maps synonyms onto the canonical term - confirm against VocabularyGroup.
 *
 * @param vocabularies the vocabulary group used for the lookups
 * @param r the relation to update in place
 */
private static void cleanRelation(VocabularyGroup vocabularies, Relation r) {
	// subRelType is handled first, relClass second; the two steps are independent of each other
	if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_SUBRELTYPE)) {
		r
			.setSubRelType(
				vocabularies.lookup(ModelConstants.DNET_RELATION_SUBRELTYPE, r.getSubRelType()).getClassid());
	}
	if (vocabularies.vocabularyExists(ModelConstants.DNET_RELATION_RELCLASS)) {
		r
			.setRelClass(
				vocabularies.lookup(ModelConstants.DNET_RELATION_RELCLASS, r.getRelClass()).getClassid());
	}
}
/**
 * Cleans a {@link Country} qualifier in place.
 * <p>
 * When the country has a blank {@code schemeid}, both {@code schemeid} and {@code schemename} are set to
 * {@link ModelConstants#DNET_COUNTRY_TYPE}; the country is then handed to {@code cleanQualifier}.
 *
 * @param vocabularies the vocabulary group forwarded to {@code cleanQualifier}
 * @param o the country to clean
 */
private static void cleanCountry(VocabularyGroup vocabularies, Country o) {
	final boolean schemeMissing = StringUtils.isBlank(o.getSchemeid());
	if (schemeMissing) {
		// default the scheme before the generic qualifier cleaning runs
		o.setSchemeid(ModelConstants.DNET_COUNTRY_TYPE);
		o.setSchemename(ModelConstants.DNET_COUNTRY_TYPE);
	}
	cleanQualifier(vocabularies, o);
}
private static <Q extends Qualifier> void cleanQualifier(VocabularyGroup vocabularies, Q q) {
if (vocabularies.vocabularyExists(q.getSchemeid())) {
Qualifier newValue = vocabularies.lookup(q.getSchemeid(), q.getClassid());

View File

@ -5,6 +5,7 @@ import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.stream.Stream;
@ -16,12 +17,12 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ -29,7 +30,8 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
public class GraphCleaningFunctionsTest {
public static final ObjectMapper MAPPER = new ObjectMapper();
public static final ObjectMapper MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Mock
private ISLookUpService isLookUpService;
@ -49,6 +51,23 @@ public class GraphCleaningFunctionsTest {
mapping = CleaningRuleMap.create(vocabularies);
}
/**
 * Reads the relation fixture (one JSON relation per line), checks that each input relClass is not yet a
 * term of the relClass vocabulary, applies the cleaning rules and verifies that the resulting relClass and
 * subRelType are terms of their respective vocabularies.
 *
 * @throws Exception if the fixture cannot be read or parsed
 */
@Test
void testCleanRelations() throws Exception {
	final List<String> lines;
	// try-with-resources closes the fixture stream; the charset is explicit so that decoding does not
	// depend on the platform default
	try (java.io.InputStream in = getClass()
		.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/relation.json")) {
		// fail with a readable message instead of an NPE when the fixture is missing from the classpath
		assertNotNull(in, "test resource not found: /eu/dnetlib/dhp/oa/graph/clean/relation.json");
		lines = IOUtils.readLines(in, java.nio.charset.StandardCharsets.UTF_8);
	}

	for (String json : lines) {
		Relation r_in = MAPPER.readValue(json, Relation.class);
		assertNotNull(r_in);

		// precondition: the raw relClass is not a vocabulary term before cleaning
		assertFalse(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_in.getRelClass()));

		Relation r_out = OafCleaner.apply(r_in, mapping);
		assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_out.getRelClass()));
		assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r_out.getSubRelType()));
	}
}
@Test
void testCleaning() throws Exception {
@ -87,7 +106,7 @@ public class GraphCleaningFunctionsTest {
p_out
.getPid()
.stream()
.map(p -> p.getQualifier())
.map(StructuredProperty::getQualifier)
.allMatch(q -> pidTerms.contains(q.getClassid())));
List<Instance> poi = p_out.getInstance();
@ -101,8 +120,8 @@ public class GraphCleaningFunctionsTest {
assertEquals(2, poii.getPid().size());
assertTrue(
poii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent());
assertTrue(poii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent());
poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
assertTrue(poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
assertNotNull(poii.getAlternateIdentifier());
assertEquals(2, poii.getAlternateIdentifier().size());
@ -111,16 +130,12 @@ public class GraphCleaningFunctionsTest {
poii
.getAlternateIdentifier()
.stream()
.filter(s -> s.getValue().equals("10.1007/s109090161569x"))
.findFirst()
.isPresent());
.anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
assertTrue(
poii
.getAlternateIdentifier()
.stream()
.filter(s -> s.getValue().equals("10.1009/qwerty"))
.findFirst()
.isPresent());
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out);
@ -142,8 +157,8 @@ public class GraphCleaningFunctionsTest {
assertEquals(2, pcii.getPid().size());
assertTrue(
pcii.getPid().stream().filter(s -> s.getValue().equals("10.1007/s109090161569x")).findFirst().isPresent());
assertTrue(pcii.getPid().stream().filter(s -> s.getValue().equals("10.1008/abcd")).findFirst().isPresent());
pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
assertTrue(pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
assertNotNull(pcii.getAlternateIdentifier());
assertEquals(1, pcii.getAlternateIdentifier().size());
@ -151,9 +166,7 @@ public class GraphCleaningFunctionsTest {
pcii
.getAlternateIdentifier()
.stream()
.filter(s -> s.getValue().equals("10.1009/qwerty"))
.findFirst()
.isPresent());
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
getAuthorPids(p_cleaned).forEach(pid -> {
System.out
@ -172,17 +185,17 @@ public class GraphCleaningFunctionsTest {
return pub
.getAuthor()
.stream()
.map(a -> a.getPid())
.flatMap(p -> p.stream())
.map(s -> s.getQualifier());
.map(Author::getPid)
.flatMap(Collection::stream)
.map(StructuredProperty::getQualifier);
}
private Stream<StructuredProperty> getAuthorPids(Result pub) {
return pub
.getAuthor()
.stream()
.map(a -> a.getPid())
.flatMap(p -> p.stream());
.map(Author::getPid)
.flatMap(Collection::stream);
}
private List<String> vocs() throws IOException {

View File

@ -0,0 +1,10 @@
{"relType":"resultResult","subRelType":"citation","relClass":"cites","source":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","target":"50|openaire____::007a4870b31056f89b768cf508e1538e"}
{"relType":"resultResult","subRelType":"citation","relClass":"isCitedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
{"relType":"resultResult","subRelType":"supplement","relClass":"isSupplementTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
{"relType":"resultResult","subRelType":"supplement","relClass":"isSupplementedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
{"relType":"resultResult","subRelType":"part","relClass":"isPartOf","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
{"relType":"resultResult","subRelType":"part","relClass":"hasPart","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
{"relType":"resultResult","subRelType":"review","relClass":"isReviewedBy","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
{"relType":"resultResult","subRelType":"review","relClass":"reviews","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
{"relType":"resultResult","subRelType":"relationship","relClass":"isRelatedTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}
{"relType":"resultResult","subRelType":"publicationDataset","relClass":"isRelatedTo","source":"50|openaire____::007a4870b31056f89b768cf508e1538e","target":"50|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556"}

View File

@ -1231,4 +1231,14 @@ dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-学術雑誌論文(査
dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-紀要論文(査読有り)
dnet:review_levels @=@ 0001 @=@ 印刷物/電子媒体-雑誌記事(査読有り)
dnet:review_levels @=@ 0001 @=@ 原著論文(査読有り)
dnet:review_levels @=@ 0001 @=@ 査読論文
dnet:review_levels @=@ 0001 @=@ 査読論文
dnet:relation_relClass @=@ Cites @=@ cites
dnet:relation_relClass @=@ IsCitedBy @=@ isCitedBy
dnet:relation_relClass @=@ HasPart @=@ hasPart
dnet:relation_relClass @=@ IsPartOf @=@ isPartOf
dnet:relation_relClass @=@ IsReviewedBy @=@ isReviewedBy
dnet:relation_relClass @=@ Reviews @=@ reviews
dnet:relation_relClass @=@ IsSupplementTo @=@ isSupplementTo
dnet:relation_relClass @=@ IsSupplementedBy @=@ isSupplementedBy
dnet:relation_relClass @=@ IsRelatedTo @=@ isRelatedTo
dnet:relation_subRelType @=@ relationship @=@ publicationDataset

View File

@ -1079,4 +1079,41 @@ dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/DATASET/IS_SUPPLEMENTED
dnet:topic_types @=@ dnet:topic_types @=@ ENRICH/MISSING/AUTHOR/ORCID @=@ An Open Researcher and Contributor ID (ORCID) that can be associated to an author of your publications
dnet:review_levels @=@ dnet:review_levels @=@ 0000 @=@ Unknown
dnet:review_levels @=@ dnet:review_levels @=@ 0002 @=@ nonPeerReviewed
dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed
dnet:review_levels @=@ dnet:review_levels @=@ 0001 @=@ peerReviewed
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Cites @=@ Cites
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsCitedBy @=@ IsCitedBy
dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasPart @=@ HasPart
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsPartOf @=@ IsPartOf
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsReviewedBy @=@ IsReviewedBy
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Reviews @=@ Reviews
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSupplementTo @=@ IsSupplementTo
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSupplementedBy @=@ IsSupplementedBy
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsRelatedTo @=@ IsRelatedTo
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Compiles @=@ Compiles
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Continues @=@ Continues
dnet:relation_relClass @=@ dnet:relation_relClass @=@ Documents @=@ Documents
dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasAmongTopNSimilarDocuments @=@ HasAmongTopNSimilarDocuments
dnet:relation_relClass @=@ dnet:relation_relClass @=@ HasVersion @=@ HasVersion
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsAmongTopNSimilarDocuments @=@ IsAmongTopNSimilarDocuments
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsCompiledBy @=@ IsCompiledBy
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsContinuedBy @=@ IsContinuedBy
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsDerivedFrom @=@ IsDerivedFrom
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsDocumentedBy @=@ IsDocumentedBy
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsNewVersionOf @=@ IsNewVersionOf
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsObsoletedBy @=@ IsObsoletedBy
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsOriginalFormOf @=@ IsOriginalFormOf
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsPreviousVersionOf @=@ IsPreviousVersionOf
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsSourceOf @=@ IsSourceOf
dnet:relation_relClass @=@ dnet:relation_relClass @=@ IsVariantFormOf @=@ IsVariantFormOf
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ affiliation @=@ affiliation
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ citation @=@ citation
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ dedup @=@ dedup
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ outcome @=@ outcome
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ part @=@ part
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ participation @=@ participation
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ provision @=@ provision
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ relationship @=@ relationship
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ review @=@ review
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ similarity @=@ similarity
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ supplement @=@ supplement
dnet:relation_subRelType @=@ dnet:relation_subRelType @=@ version @=@ version

View File

@ -753,7 +753,7 @@
<mockito-core.version>3.3.3</mockito-core.version>
<mongodb.driver.version>3.4.2</mongodb.driver.version>
<vtd.version>[2.12,3.0)</vtd.version>
<dhp-schemas.version>[2.7.17]</dhp-schemas.version>
<dhp-schemas.version>[2.7.18]</dhp-schemas.version>
<dnet-actionmanager-api.version>[4.0.3]</dnet-actionmanager-api.version>
<dnet-actionmanager-common.version>[6.0.5]</dnet-actionmanager-common.version>
<dnet-openaire-broker-common.version>[3.1.6]</dnet-openaire-broker-common.version>