2020-05-22 12:34:00 +02:00
|
|
|
|
2020-05-21 11:52:14 +02:00
|
|
|
package eu.dnetlib.dhp.oa.dedup;
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
|
|
|
2020-05-21 11:52:14 +02:00
|
|
|
import java.io.BufferedReader;
|
|
|
|
import java.io.FileReader;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.Serializable;
|
|
|
|
import java.nio.file.Paths;
|
|
|
|
import java.util.*;
|
2020-11-04 15:02:02 +01:00
|
|
|
import java.util.stream.Collectors;
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
import org.codehaus.jackson.map.ObjectMapper;
|
|
|
|
import org.junit.jupiter.api.BeforeEach;
|
|
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
|
2020-10-08 17:29:29 +02:00
|
|
|
import eu.dnetlib.dhp.oa.merge.AuthorMerger;
|
2020-05-22 12:34:00 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.*;
|
|
|
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
|
|
import scala.Tuple2;
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
public class EntityMergerTest implements Serializable {
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
private List<Tuple2<String, Publication>> publications;
|
|
|
|
private List<Tuple2<String, Publication>> publications2;
|
|
|
|
private List<Tuple2<String, Publication>> publications3;
|
|
|
|
private List<Tuple2<String, Publication>> publications4;
|
|
|
|
private List<Tuple2<String, Publication>> publications5;
|
2020-05-21 11:52:14 +02:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
private String testEntityBasePath;
|
|
|
|
private DataInfo dataInfo;
|
|
|
|
private String dedupId = "00|dedup_id::1";
|
|
|
|
private Publication pub_top;
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
@BeforeEach
|
|
|
|
public void setUp() throws Exception {
|
|
|
|
|
|
|
|
testEntityBasePath = Paths
|
2020-05-22 12:34:00 +02:00
|
|
|
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
|
|
|
.toFile()
|
|
|
|
.getAbsolutePath();
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
|
2020-05-22 17:24:57 +02:00
|
|
|
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
|
2020-07-22 17:29:48 +02:00
|
|
|
publications3 = readSample(testEntityBasePath + "/publication_merge3.json", Publication.class);
|
2020-07-24 20:10:47 +02:00
|
|
|
publications4 = readSample(testEntityBasePath + "/publication_merge4.json", Publication.class);
|
|
|
|
publications5 = readSample(testEntityBasePath + "/publication_merge5.json", Publication.class);
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
pub_top = getTopPub(publications);
|
|
|
|
|
|
|
|
dataInfo = setDI();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2020-07-08 17:03:53 +02:00
|
|
|
@Test
|
|
|
|
public void softwareMergerTest() throws InstantiationException, IllegalAccessException {
|
2020-10-20 15:12:19 +02:00
|
|
|
|
2020-07-10 16:12:08 +02:00
|
|
|
List<Tuple2<String, Software>> softwares = readSample(
|
|
|
|
testEntityBasePath + "/software_merge.json", Software.class);
|
2020-07-08 17:03:53 +02:00
|
|
|
|
|
|
|
Software merged = DedupRecordFactory
|
2020-07-10 16:12:08 +02:00
|
|
|
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
|
2020-07-08 17:03:53 +02:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
assertEquals("OPEN SOURCE", merged.getBestaccessright().getClassid());
|
2020-07-22 17:29:48 +02:00
|
|
|
|
2021-04-27 09:47:49 +02:00
|
|
|
assertEquals(dedupId, merged.getId());
|
2020-07-22 17:29:48 +02:00
|
|
|
|
2020-07-08 17:03:53 +02:00
|
|
|
}
|
|
|
|
|
2020-05-21 11:52:14 +02:00
|
|
|
@Test
|
|
|
|
public void publicationMergerTest() throws InstantiationException, IllegalAccessException {
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
Publication pub_merged = DedupRecordFactory
|
|
|
|
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
2020-05-21 11:52:14 +02:00
|
|
|
|
2020-07-22 17:29:48 +02:00
|
|
|
// verify id
|
2021-04-27 09:47:49 +02:00
|
|
|
assertEquals(dedupId, pub_merged.getId());
|
2020-11-04 15:02:02 +01:00
|
|
|
|
|
|
|
assertEquals(pub_top.getJournal(), pub_merged.getJournal());
|
|
|
|
assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
|
|
|
|
assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
|
|
|
|
assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
|
|
|
|
assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
|
|
|
|
assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
|
|
|
|
assertEquals(pub_top.getResourcetype().getClassid(), "");
|
|
|
|
assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
|
|
|
|
assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
|
|
|
|
assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
|
|
|
|
assertEquals(3, pub_merged.getInstance().size());
|
|
|
|
assertEquals(2, pub_merged.getCountry().size());
|
|
|
|
assertEquals(0, pub_merged.getSubject().size());
|
|
|
|
assertEquals(2, pub_merged.getTitle().size());
|
|
|
|
assertEquals(0, pub_merged.getRelevantdate().size());
|
|
|
|
assertEquals(0, pub_merged.getDescription().size());
|
|
|
|
assertEquals(0, pub_merged.getSource().size());
|
|
|
|
assertEquals(0, pub_merged.getFulltext().size());
|
|
|
|
assertEquals(0, pub_merged.getFormat().size());
|
|
|
|
assertEquals(0, pub_merged.getContributor().size());
|
|
|
|
assertEquals(0, pub_merged.getCoverage().size());
|
|
|
|
assertEquals(0, pub_merged.getContext().size());
|
|
|
|
assertEquals(0, pub_merged.getExternalReference().size());
|
|
|
|
assertEquals(3, pub_merged.getOriginalId().size());
|
|
|
|
assertEquals(3, pub_merged.getCollectedfrom().size());
|
|
|
|
assertEquals(1, pub_merged.getPid().size());
|
|
|
|
assertEquals(0, pub_merged.getExtraInfo().size());
|
2020-05-22 12:34:00 +02:00
|
|
|
|
|
|
|
// verify datainfo
|
2020-11-04 15:02:02 +01:00
|
|
|
assertEquals(dataInfo, pub_merged.getDataInfo());
|
2020-05-21 11:52:14 +02:00
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
// verify datepicker
|
2020-11-04 15:02:02 +01:00
|
|
|
assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
|
2020-05-21 11:52:14 +02:00
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
// verify authors
|
2021-03-29 10:07:12 +02:00
|
|
|
assertEquals(13, pub_merged.getAuthor().size());
|
2020-11-04 15:02:02 +01:00
|
|
|
assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
|
2020-05-25 18:02:57 +02:00
|
|
|
|
2020-05-26 09:54:13 +02:00
|
|
|
// verify title
|
2020-05-25 18:02:57 +02:00
|
|
|
int count = 0;
|
2020-05-26 09:54:13 +02:00
|
|
|
for (StructuredProperty title : pub_merged.getTitle()) {
|
2020-05-25 18:02:57 +02:00
|
|
|
if (title.getQualifier().getClassid().equals("main title"))
|
|
|
|
count++;
|
|
|
|
}
|
2020-11-04 15:02:02 +01:00
|
|
|
assertEquals(1, count);
|
2020-05-21 11:52:14 +02:00
|
|
|
}
|
|
|
|
|
2020-05-22 17:24:57 +02:00
|
|
|
@Test
|
2020-05-25 18:02:57 +02:00
|
|
|
public void publicationMergerTest2() throws InstantiationException, IllegalAccessException {
|
2020-05-22 17:24:57 +02:00
|
|
|
|
|
|
|
Publication pub_merged = DedupRecordFactory
|
2020-05-23 09:51:48 +02:00
|
|
|
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
2020-05-22 17:24:57 +02:00
|
|
|
|
2020-07-22 17:29:48 +02:00
|
|
|
// verify id
|
2021-04-27 09:47:49 +02:00
|
|
|
assertEquals(dedupId, pub_merged.getId());
|
2020-07-22 17:29:48 +02:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
assertEquals(27, pub_merged.getAuthor().size());
|
2020-05-22 17:24:57 +02:00
|
|
|
}
|
|
|
|
|
2020-07-22 17:29:48 +02:00
|
|
|
@Test
|
|
|
|
public void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
|
|
|
|
|
|
|
|
Publication pub_merged = DedupRecordFactory
|
2020-09-29 15:31:46 +02:00
|
|
|
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
|
2020-07-22 17:29:48 +02:00
|
|
|
|
|
|
|
// verify id
|
2021-04-27 09:47:49 +02:00
|
|
|
assertEquals(dedupId, pub_merged.getId());
|
2020-07-24 20:10:47 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Test
|
|
|
|
public void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
|
|
|
|
|
|
|
Publication pub_merged = DedupRecordFactory
|
2020-09-29 15:31:46 +02:00
|
|
|
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
|
2020-07-24 20:10:47 +02:00
|
|
|
|
|
|
|
// verify id
|
2021-04-27 09:47:49 +02:00
|
|
|
assertEquals(dedupId, pub_merged.getId());
|
2020-07-22 17:29:48 +02:00
|
|
|
}
|
|
|
|
|
2020-07-24 20:10:47 +02:00
|
|
|
@Test
|
|
|
|
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
System.out
|
|
|
|
.println(
|
|
|
|
publications5
|
|
|
|
.stream()
|
|
|
|
.map(p -> p._2().getId())
|
|
|
|
.collect(Collectors.toList()));
|
|
|
|
|
2020-07-24 20:10:47 +02:00
|
|
|
Publication pub_merged = DedupRecordFactory
|
2020-09-29 15:31:46 +02:00
|
|
|
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
|
2020-07-24 20:10:47 +02:00
|
|
|
|
|
|
|
// verify id
|
2021-04-27 09:47:49 +02:00
|
|
|
assertEquals(dedupId, pub_merged.getId());
|
2020-07-24 20:10:47 +02:00
|
|
|
}
|
2020-07-22 17:29:48 +02:00
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
public DataInfo setDI() {
|
2020-05-21 11:52:14 +02:00
|
|
|
DataInfo dataInfo = new DataInfo();
|
|
|
|
dataInfo.setTrust("0.9");
|
|
|
|
dataInfo.setDeletedbyinference(false);
|
|
|
|
dataInfo.setInferenceprovenance("testing");
|
|
|
|
dataInfo.setInferred(true);
|
|
|
|
return dataInfo;
|
|
|
|
}
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
public Publication getTopPub(List<Tuple2<String, Publication>> publications) {
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
Double maxTrust = 0.0;
|
|
|
|
Publication maxPub = new Publication();
|
|
|
|
for (Tuple2<String, Publication> publication : publications) {
|
|
|
|
Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
|
2020-05-22 12:34:00 +02:00
|
|
|
if (pubTrust > maxTrust) {
|
2020-05-21 11:52:14 +02:00
|
|
|
maxTrust = pubTrust;
|
|
|
|
maxPub = publication._2();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return maxPub;
|
|
|
|
}
|
|
|
|
|
|
|
|
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
|
|
|
List<Tuple2<String, T>> res = new ArrayList<>();
|
|
|
|
BufferedReader reader;
|
|
|
|
try {
|
|
|
|
reader = new BufferedReader(new FileReader(path));
|
|
|
|
String line = reader.readLine();
|
|
|
|
while (line != null) {
|
2020-05-22 12:34:00 +02:00
|
|
|
res
|
|
|
|
.add(
|
2020-05-21 11:52:14 +02:00
|
|
|
new Tuple2<>(
|
2020-05-22 12:34:00 +02:00
|
|
|
MapDocumentUtil.getJPathString("$.id", line),
|
|
|
|
new ObjectMapper().readValue(line, clazz)));
|
2020-05-21 11:52:14 +02:00
|
|
|
// read next line
|
|
|
|
line = reader.readLine();
|
|
|
|
}
|
|
|
|
reader.close();
|
|
|
|
} catch (IOException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
}
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|