2020-05-22 12:34:00 +02:00
|
|
|
|
2020-05-21 11:52:14 +02:00
|
|
|
package eu.dnetlib.dhp.oa.dedup;
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
|
|
|
2020-05-21 11:52:14 +02:00
|
|
|
import java.io.BufferedReader;
|
|
|
|
import java.io.FileReader;
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.Serializable;
|
|
|
|
import java.nio.file.Paths;
|
|
|
|
import java.util.*;
|
|
|
|
|
|
|
|
import org.codehaus.jackson.map.ObjectMapper;
|
|
|
|
import org.junit.jupiter.api.BeforeEach;
|
|
|
|
import org.junit.jupiter.api.Test;
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
import eu.dnetlib.dhp.schema.oaf.*;
|
|
|
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
|
|
import scala.Tuple2;
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
public class EntityMergerTest implements Serializable {
|
|
|
|
|
|
|
|
List<Tuple2<String, Publication>> publications;
|
2020-05-22 17:24:57 +02:00
|
|
|
List<Tuple2<String, Publication>> publications2;
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
String testEntityBasePath;
|
|
|
|
DataInfo dataInfo;
|
|
|
|
String dedupId = "dedup_id";
|
|
|
|
Publication pub_top;
|
|
|
|
|
|
|
|
@BeforeEach
|
|
|
|
public void setUp() throws Exception {
|
|
|
|
|
|
|
|
testEntityBasePath = Paths
|
2020-05-22 12:34:00 +02:00
|
|
|
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
|
|
|
.toFile()
|
|
|
|
.getAbsolutePath();
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
|
2020-05-22 17:24:57 +02:00
|
|
|
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
pub_top = getTopPub(publications);
|
|
|
|
|
|
|
|
dataInfo = setDI();
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2020-07-08 17:03:53 +02:00
|
|
|
@Test
|
|
|
|
public void softwareMergerTest() throws InstantiationException, IllegalAccessException {
|
2020-07-10 16:12:08 +02:00
|
|
|
List<Tuple2<String, Software>> softwares = readSample(
|
|
|
|
testEntityBasePath + "/software_merge.json", Software.class);
|
2020-07-08 17:03:53 +02:00
|
|
|
|
|
|
|
Software merged = DedupRecordFactory
|
2020-07-10 16:12:08 +02:00
|
|
|
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
|
2020-07-08 17:03:53 +02:00
|
|
|
|
2020-07-12 19:28:56 +02:00
|
|
|
assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");
|
2020-07-08 17:03:53 +02:00
|
|
|
}
|
|
|
|
|
2020-05-21 11:52:14 +02:00
|
|
|
@Test
|
|
|
|
public void publicationMergerTest() throws InstantiationException, IllegalAccessException {
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
Publication pub_merged = DedupRecordFactory
|
|
|
|
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
assertEquals(dedupId, pub_merged.getId());
|
|
|
|
|
|
|
|
assertEquals(pub_merged.getJournal(), pub_top.getJournal());
|
|
|
|
assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright());
|
|
|
|
assertEquals(pub_merged.getResulttype(), pub_top.getResulttype());
|
|
|
|
assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage());
|
|
|
|
assertEquals(pub_merged.getPublisher(), pub_top.getPublisher());
|
|
|
|
assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate());
|
|
|
|
assertEquals(pub_merged.getResourcetype().getClassid(), "0004");
|
|
|
|
assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation());
|
|
|
|
assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance());
|
|
|
|
assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection());
|
2020-05-22 12:34:00 +02:00
|
|
|
assertEquals(pub_merged.getInstance().size(), 3);
|
2020-05-21 11:52:14 +02:00
|
|
|
assertEquals(pub_merged.getCountry().size(), 2);
|
|
|
|
assertEquals(pub_merged.getSubject().size(), 0);
|
|
|
|
assertEquals(pub_merged.getTitle().size(), 2);
|
2020-05-22 12:34:00 +02:00
|
|
|
assertEquals(pub_merged.getRelevantdate().size(), 0);
|
|
|
|
assertEquals(pub_merged.getDescription().size(), 0);
|
|
|
|
assertEquals(pub_merged.getSource().size(), 0);
|
|
|
|
assertEquals(pub_merged.getFulltext().size(), 0);
|
|
|
|
assertEquals(pub_merged.getFormat().size(), 0);
|
|
|
|
assertEquals(pub_merged.getContributor().size(), 0);
|
|
|
|
assertEquals(pub_merged.getCoverage().size(), 0);
|
|
|
|
assertEquals(pub_merged.getContext().size(), 0);
|
|
|
|
assertEquals(pub_merged.getExternalReference().size(), 0);
|
|
|
|
assertEquals(pub_merged.getOriginalId().size(), 3);
|
|
|
|
assertEquals(pub_merged.getCollectedfrom().size(), 3);
|
|
|
|
assertEquals(pub_merged.getPid().size(), 1);
|
|
|
|
assertEquals(pub_merged.getExtraInfo().size(), 0);
|
|
|
|
|
|
|
|
// verify datainfo
|
2020-05-21 11:52:14 +02:00
|
|
|
assertEquals(pub_merged.getDataInfo(), dataInfo);
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
// verify datepicker
|
2020-05-21 11:52:14 +02:00
|
|
|
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
// verify authors
|
2020-05-21 11:52:14 +02:00
|
|
|
assertEquals(pub_merged.getAuthor().size(), 9);
|
|
|
|
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
2020-05-25 18:02:57 +02:00
|
|
|
|
2020-05-26 09:54:13 +02:00
|
|
|
// verify title
|
2020-05-25 18:02:57 +02:00
|
|
|
int count = 0;
|
2020-05-26 09:54:13 +02:00
|
|
|
for (StructuredProperty title : pub_merged.getTitle()) {
|
2020-05-25 18:02:57 +02:00
|
|
|
if (title.getQualifier().getClassid().equals("main title"))
|
|
|
|
count++;
|
|
|
|
}
|
|
|
|
assertEquals(count, 1);
|
2020-05-21 11:52:14 +02:00
|
|
|
}
|
|
|
|
|
2020-05-22 17:24:57 +02:00
|
|
|
@Test
|
2020-05-25 18:02:57 +02:00
|
|
|
public void publicationMergerTest2() throws InstantiationException, IllegalAccessException {
|
2020-05-22 17:24:57 +02:00
|
|
|
|
|
|
|
Publication pub_merged = DedupRecordFactory
|
2020-05-23 09:51:48 +02:00
|
|
|
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
2020-05-22 17:24:57 +02:00
|
|
|
|
|
|
|
assertEquals(pub_merged.getAuthor().size(), 27);
|
|
|
|
// insert assertions here
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
public DataInfo setDI() {
|
2020-05-21 11:52:14 +02:00
|
|
|
DataInfo dataInfo = new DataInfo();
|
|
|
|
dataInfo.setTrust("0.9");
|
|
|
|
dataInfo.setDeletedbyinference(false);
|
|
|
|
dataInfo.setInferenceprovenance("testing");
|
|
|
|
dataInfo.setInferred(true);
|
|
|
|
return dataInfo;
|
|
|
|
}
|
|
|
|
|
2020-05-22 12:34:00 +02:00
|
|
|
public Publication getTopPub(List<Tuple2<String, Publication>> publications) {
|
2020-05-21 11:52:14 +02:00
|
|
|
|
|
|
|
Double maxTrust = 0.0;
|
|
|
|
Publication maxPub = new Publication();
|
|
|
|
for (Tuple2<String, Publication> publication : publications) {
|
|
|
|
Double pubTrust = Double.parseDouble(publication._2().getDataInfo().getTrust());
|
2020-05-22 12:34:00 +02:00
|
|
|
if (pubTrust > maxTrust) {
|
2020-05-21 11:52:14 +02:00
|
|
|
maxTrust = pubTrust;
|
|
|
|
maxPub = publication._2();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return maxPub;
|
|
|
|
}
|
|
|
|
|
|
|
|
public <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
|
|
|
List<Tuple2<String, T>> res = new ArrayList<>();
|
|
|
|
BufferedReader reader;
|
|
|
|
try {
|
|
|
|
reader = new BufferedReader(new FileReader(path));
|
|
|
|
String line = reader.readLine();
|
|
|
|
while (line != null) {
|
2020-05-22 12:34:00 +02:00
|
|
|
res
|
|
|
|
.add(
|
2020-05-21 11:52:14 +02:00
|
|
|
new Tuple2<>(
|
2020-05-22 12:34:00 +02:00
|
|
|
MapDocumentUtil.getJPathString("$.id", line),
|
|
|
|
new ObjectMapper().readValue(line, clazz)));
|
2020-05-21 11:52:14 +02:00
|
|
|
// read next line
|
|
|
|
line = reader.readLine();
|
|
|
|
}
|
|
|
|
reader.close();
|
|
|
|
} catch (IOException e) {
|
|
|
|
e.printStackTrace();
|
|
|
|
}
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|