2020-06-09 17:20:40 +02:00
|
|
|
|
|
|
|
package eu.dnetlib.dhp.oa.graph.clean;
|
|
|
|
|
2020-06-09 19:52:53 +02:00
|
|
|
import static org.junit.jupiter.api.Assertions.*;
import static org.mockito.Mockito.lenient;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.MappableBlock;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;

import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;

import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.GraphCleaningFunctions;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
|
|
|
|
|
|
|
@ExtendWith(MockitoExtension.class)
|
2021-05-10 16:43:39 +02:00
|
|
|
public class GraphCleaningFunctionsTest {
|
2020-06-09 17:20:40 +02:00
|
|
|
|
2021-09-15 16:10:37 +02:00
|
|
|
public static final ObjectMapper MAPPER = new ObjectMapper()
|
|
|
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
2020-06-09 19:52:53 +02:00
|
|
|
|
2020-06-09 17:20:40 +02:00
|
|
|
@Mock
|
|
|
|
private ISLookUpService isLookUpService;
|
|
|
|
|
|
|
|
private VocabularyGroup vocabularies;
|
|
|
|
|
2020-06-13 13:06:04 +02:00
|
|
|
private CleaningRuleMap mapping;
|
2020-06-09 17:20:40 +02:00
|
|
|
|
|
|
|
@BeforeEach
|
|
|
|
public void setUp() throws ISLookUpException, IOException {
|
|
|
|
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
|
|
|
lenient()
|
|
|
|
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
|
|
|
.thenReturn(synonyms());
|
|
|
|
|
|
|
|
vocabularies = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
2020-06-13 13:06:04 +02:00
|
|
|
mapping = CleaningRuleMap.create(vocabularies);
|
2020-06-09 17:20:40 +02:00
|
|
|
}
|
|
|
|
|
2021-09-15 16:10:37 +02:00
|
|
|
@Test
|
|
|
|
void testCleanRelations() throws Exception {
|
|
|
|
|
|
|
|
List<String> lines = IOUtils
|
|
|
|
.readLines(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/relation.json"));
|
|
|
|
for (String json : lines) {
|
|
|
|
Relation r_in = MAPPER.readValue(json, Relation.class);
|
|
|
|
assertNotNull(r_in);
|
|
|
|
|
|
|
|
assertFalse(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_in.getRelClass()));
|
|
|
|
|
|
|
|
Relation r_out = OafCleaner.apply(r_in, mapping);
|
|
|
|
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_RELCLASS).contains(r_out.getRelClass()));
|
|
|
|
assertTrue(vocabularies.getTerms(ModelConstants.DNET_RELATION_SUBRELTYPE).contains(r_out.getSubRelType()));
|
2021-12-01 15:49:15 +01:00
|
|
|
|
|
|
|
assertEquals("iis", r_out.getDataInfo().getProvenanceaction().getClassid());
|
|
|
|
assertEquals("Inferred by OpenAIRE", r_out.getDataInfo().getProvenanceaction().getClassname());
|
2021-09-15 16:10:37 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-12-03 09:08:12 +01:00
|
|
|
@Test
|
|
|
|
void testFilter_false() throws Exception {
|
|
|
|
|
|
|
|
assertNotNull(vocabularies);
|
|
|
|
assertNotNull(mapping);
|
|
|
|
|
|
|
|
String json = IOUtils
|
|
|
|
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_invisible.json"));
|
|
|
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
|
|
|
|
|
|
|
assertTrue(p_in instanceof Result);
|
|
|
|
assertTrue(p_in instanceof Publication);
|
|
|
|
|
|
|
|
assertEquals(false, GraphCleaningFunctions.filter(p_in));
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
void testFilter_true() throws Exception {
|
|
|
|
|
|
|
|
assertNotNull(vocabularies);
|
|
|
|
assertNotNull(mapping);
|
|
|
|
|
|
|
|
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
|
|
|
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
|
|
|
|
|
|
|
assertTrue(p_in instanceof Result);
|
|
|
|
assertTrue(p_in instanceof Publication);
|
|
|
|
|
|
|
|
assertEquals(true, GraphCleaningFunctions.filter(p_in));
|
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
void testFilter_missing_invisible() throws Exception {
|
|
|
|
|
|
|
|
assertNotNull(vocabularies);
|
|
|
|
assertNotNull(mapping);
|
|
|
|
|
|
|
|
String json = IOUtils
|
|
|
|
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result_missing_invisible.json"));
|
|
|
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
|
|
|
|
|
|
|
assertTrue(p_in instanceof Result);
|
|
|
|
assertTrue(p_in instanceof Publication);
|
|
|
|
|
|
|
|
assertEquals(true, GraphCleaningFunctions.filter(p_in));
|
|
|
|
}
|
|
|
|
|
2020-06-09 17:20:40 +02:00
|
|
|
@Test
|
2021-08-11 12:13:22 +02:00
|
|
|
void testCleaning() throws Exception {
|
2020-06-09 17:20:40 +02:00
|
|
|
|
2020-06-13 13:06:04 +02:00
|
|
|
assertNotNull(vocabularies);
|
|
|
|
assertNotNull(mapping);
|
2020-06-09 19:52:53 +02:00
|
|
|
|
2020-06-09 17:20:40 +02:00
|
|
|
String json = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/result.json"));
|
|
|
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
|
|
|
|
2020-07-08 17:53:35 +02:00
|
|
|
assertNull(p_in.getBestaccessright());
|
|
|
|
|
2020-06-18 19:37:25 +02:00
|
|
|
assertTrue(p_in instanceof Result);
|
|
|
|
assertTrue(p_in instanceof Publication);
|
|
|
|
|
2021-05-10 16:43:39 +02:00
|
|
|
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
|
2020-06-09 17:20:40 +02:00
|
|
|
|
2020-06-09 19:52:53 +02:00
|
|
|
assertNotNull(p_out);
|
|
|
|
|
2020-07-30 17:03:53 +02:00
|
|
|
assertNotNull(p_out.getPublisher());
|
|
|
|
assertNull(p_out.getPublisher().getValue());
|
2021-04-13 14:32:41 +02:00
|
|
|
|
2020-06-15 18:32:24 +02:00
|
|
|
assertEquals("und", p_out.getLanguage().getClassid());
|
|
|
|
assertEquals("Undetermined", p_out.getLanguage().getClassname());
|
2020-06-09 19:52:53 +02:00
|
|
|
|
2020-06-18 19:37:25 +02:00
|
|
|
assertEquals("DE", p_out.getCountry().get(0).getClassid());
|
|
|
|
assertEquals("Germany", p_out.getCountry().get(0).getClassname());
|
|
|
|
|
2020-06-09 19:52:53 +02:00
|
|
|
assertEquals("0018", p_out.getInstance().get(0).getInstancetype().getClassid());
|
|
|
|
assertEquals("Annotation", p_out.getInstance().get(0).getInstancetype().getClassname());
|
|
|
|
|
2021-12-09 16:44:28 +01:00
|
|
|
assertEquals("0027", p_out.getInstance().get(1).getInstancetype().getClassid());
|
|
|
|
assertEquals("Model", p_out.getInstance().get(1).getInstancetype().getClassname());
|
|
|
|
|
|
|
|
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassid());
|
|
|
|
assertEquals("xyz", p_out.getInstance().get(2).getInstancetype().getClassname());
|
2021-12-09 13:57:53 +01:00
|
|
|
|
2020-06-09 19:52:53 +02:00
|
|
|
assertEquals("CLOSED", p_out.getInstance().get(0).getAccessright().getClassid());
|
|
|
|
assertEquals("Closed Access", p_out.getInstance().get(0).getAccessright().getClassname());
|
|
|
|
|
2021-03-23 09:28:58 +01:00
|
|
|
Set<String> pidTerms = vocabularies.getTerms(ModelConstants.DNET_PID_TYPES);
|
2020-06-09 19:52:53 +02:00
|
|
|
assertTrue(
|
|
|
|
p_out
|
|
|
|
.getPid()
|
|
|
|
.stream()
|
2021-09-15 16:10:37 +02:00
|
|
|
.map(StructuredProperty::getQualifier)
|
2020-06-09 19:52:53 +02:00
|
|
|
.allMatch(q -> pidTerms.contains(q.getClassid())));
|
2020-06-09 17:20:40 +02:00
|
|
|
|
2021-03-25 11:07:59 +01:00
|
|
|
List<Instance> poi = p_out.getInstance();
|
|
|
|
assertNotNull(poi);
|
2021-12-09 16:44:28 +01:00
|
|
|
assertEquals(3, poi.size());
|
2021-03-25 11:07:59 +01:00
|
|
|
|
|
|
|
final Instance poii = poi.get(0);
|
|
|
|
assertNotNull(poii);
|
|
|
|
assertNotNull(poii.getPid());
|
|
|
|
|
|
|
|
assertEquals(2, poii.getPid().size());
|
|
|
|
|
|
|
|
assertTrue(
|
2021-09-15 16:10:37 +02:00
|
|
|
poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
|
|
|
assertTrue(poii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
|
2021-03-25 11:07:59 +01:00
|
|
|
|
|
|
|
assertNotNull(poii.getAlternateIdentifier());
|
|
|
|
assertEquals(2, poii.getAlternateIdentifier().size());
|
|
|
|
|
|
|
|
assertTrue(
|
|
|
|
poii
|
|
|
|
.getAlternateIdentifier()
|
|
|
|
.stream()
|
2021-09-15 16:10:37 +02:00
|
|
|
.anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
2021-03-25 11:07:59 +01:00
|
|
|
assertTrue(
|
|
|
|
poii
|
|
|
|
.getAlternateIdentifier()
|
|
|
|
.stream()
|
2021-09-15 16:10:37 +02:00
|
|
|
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
2021-03-25 11:07:59 +01:00
|
|
|
|
2021-11-17 14:17:22 +01:00
|
|
|
assertEquals(5, p_out.getTitle().size());
|
|
|
|
|
2021-12-09 16:44:28 +01:00
|
|
|
Publication p_cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
|
2021-04-13 14:32:41 +02:00
|
|
|
|
2021-11-17 14:17:22 +01:00
|
|
|
assertEquals(3, p_cleaned.getTitle().size());
|
|
|
|
|
|
|
|
List<String> titles = p_cleaned
|
|
|
|
.getTitle()
|
|
|
|
.stream()
|
|
|
|
.map(StructuredProperty::getValue)
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
assertTrue(titles.contains("omic"));
|
|
|
|
assertTrue(
|
|
|
|
titles.contains("Optical response of strained- and unstrained-silicon cold-electron bolometers test"));
|
|
|
|
assertTrue(titles.contains("「マキャベリ的知性と心の理論の進化論」 リチャード・バーン, アンドリュー・ホワイトゥン 編/藤田和生, 山下博志, 友永雅巳 監訳"));
|
2021-04-13 14:32:41 +02:00
|
|
|
|
2021-03-25 11:07:59 +01:00
|
|
|
assertEquals("CLOSED", p_cleaned.getBestaccessright().getClassid());
|
2020-07-30 17:03:53 +02:00
|
|
|
assertNull(p_out.getPublisher());
|
2020-07-08 17:53:35 +02:00
|
|
|
|
2021-06-11 16:53:01 +02:00
|
|
|
assertEquals("1970-10-07", p_cleaned.getDateofacceptance().getValue());
|
|
|
|
|
2021-12-09 16:44:28 +01:00
|
|
|
assertEquals("0038", p_cleaned.getInstance().get(2).getInstancetype().getClassid());
|
|
|
|
assertEquals("Other literature type", p_cleaned.getInstance().get(2).getInstancetype().getClassname());
|
|
|
|
|
2021-03-25 11:07:59 +01:00
|
|
|
final List<Instance> pci = p_cleaned.getInstance();
|
|
|
|
assertNotNull(pci);
|
2021-12-09 16:44:28 +01:00
|
|
|
assertEquals(3, pci.size());
|
2021-03-25 11:07:59 +01:00
|
|
|
|
|
|
|
final Instance pcii = pci.get(0);
|
|
|
|
assertNotNull(pcii);
|
|
|
|
assertNotNull(pcii.getPid());
|
|
|
|
|
|
|
|
assertEquals(2, pcii.getPid().size());
|
|
|
|
|
|
|
|
assertTrue(
|
2021-09-15 16:10:37 +02:00
|
|
|
pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1007/s109090161569x")));
|
|
|
|
assertTrue(pcii.getPid().stream().anyMatch(s -> s.getValue().equals("10.1008/abcd")));
|
2021-03-25 11:07:59 +01:00
|
|
|
|
|
|
|
assertNotNull(pcii.getAlternateIdentifier());
|
|
|
|
assertEquals(1, pcii.getAlternateIdentifier().size());
|
|
|
|
assertTrue(
|
|
|
|
pcii
|
|
|
|
.getAlternateIdentifier()
|
|
|
|
.stream()
|
2021-09-15 16:10:37 +02:00
|
|
|
.anyMatch(s -> s.getValue().equals("10.1009/qwerty")));
|
2021-03-25 11:07:59 +01:00
|
|
|
|
|
|
|
getAuthorPids(p_cleaned).forEach(pid -> {
|
2020-07-30 16:24:39 +02:00
|
|
|
System.out
|
|
|
|
.println(
|
|
|
|
String
|
|
|
|
.format(
|
|
|
|
"%s [%s - %s]", pid.getValue(), pid.getQualifier().getClassid(),
|
|
|
|
pid.getQualifier().getClassname()));
|
|
|
|
});
|
|
|
|
|
2020-06-09 17:20:40 +02:00
|
|
|
// TODO add more assertions to verity the cleaned values
|
2021-03-25 11:07:59 +01:00
|
|
|
System.out.println(MAPPER.writeValueAsString(p_cleaned));
|
2020-06-09 17:20:40 +02:00
|
|
|
}
|
|
|
|
|
2020-07-30 16:24:39 +02:00
|
|
|
private Stream<Qualifier> getAuthorPidTypes(Result pub) {
|
2020-06-09 19:52:53 +02:00
|
|
|
return pub
|
|
|
|
.getAuthor()
|
|
|
|
.stream()
|
2021-09-15 16:10:37 +02:00
|
|
|
.map(Author::getPid)
|
|
|
|
.flatMap(Collection::stream)
|
|
|
|
.map(StructuredProperty::getQualifier);
|
2020-06-09 19:52:53 +02:00
|
|
|
}
|
|
|
|
|
2020-07-30 16:24:39 +02:00
|
|
|
private Stream<StructuredProperty> getAuthorPids(Result pub) {
|
|
|
|
return pub
|
|
|
|
.getAuthor()
|
|
|
|
.stream()
|
2021-09-15 16:10:37 +02:00
|
|
|
.map(Author::getPid)
|
|
|
|
.flatMap(Collection::stream);
|
2020-07-30 16:24:39 +02:00
|
|
|
}
|
|
|
|
|
2020-06-09 17:20:40 +02:00
|
|
|
private List<String> vocs() throws IOException {
|
|
|
|
return IOUtils
|
2021-05-10 16:43:39 +02:00
|
|
|
.readLines(
|
|
|
|
GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
|
2020-06-09 17:20:40 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
private List<String> synonyms() throws IOException {
|
|
|
|
return IOUtils
|
2021-05-10 16:43:39 +02:00
|
|
|
.readLines(
|
|
|
|
GraphCleaningFunctionsTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
|
2020-06-09 17:20:40 +02:00
|
|
|
}
|
2021-11-21 16:35:22 +01:00
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testCleanDoiBoost() throws IOException {
|
2021-11-26 15:38:16 +01:00
|
|
|
String json = IOUtils
|
|
|
|
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub.json"));
|
2021-11-21 16:35:22 +01:00
|
|
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
|
|
|
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
|
2021-12-09 16:44:28 +01:00
|
|
|
Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
|
2021-11-21 16:35:22 +01:00
|
|
|
|
2021-11-26 15:38:16 +01:00
|
|
|
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
|
2021-11-21 16:35:22 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void testCleanDoiBoost2() throws IOException {
|
2021-11-26 15:38:16 +01:00
|
|
|
String json = IOUtils
|
|
|
|
.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/doiboostpub2.json"));
|
2021-11-21 16:35:22 +01:00
|
|
|
Publication p_in = MAPPER.readValue(json, Publication.class);
|
|
|
|
Publication p_out = OafCleaner.apply(GraphCleaningFunctions.fixVocabularyNames(p_in), mapping);
|
2021-12-09 16:44:28 +01:00
|
|
|
Publication cleaned = GraphCleaningFunctions.cleanup(p_out, vocabularies);
|
2021-11-21 16:35:22 +01:00
|
|
|
|
2021-11-26 15:38:16 +01:00
|
|
|
Assertions.assertEquals(true, GraphCleaningFunctions.filter(cleaned));
|
2021-11-21 16:35:22 +01:00
|
|
|
|
|
|
|
}
|
2020-06-09 17:20:40 +02:00
|
|
|
}
|