Bug fix in the IdGenerator and test implementation

This commit is contained in:
miconis 2020-10-09 09:30:23 +02:00
parent 1804c5d809
commit 6f8720982c
5 changed files with 165 additions and 13 deletions

View File

@ -1,4 +1,3 @@
package eu.dnetlib.dhp.oa.dedup;
import java.util.*;

View File

@ -24,6 +24,7 @@ public class IdGenerator implements Serializable {
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
public static String BASE_DATE = "2000-01-01";
// pick the best pid from the list (consider date and pidtype)
public static String generate(List<Identifier> pids, String defaultID) {
@ -45,14 +46,27 @@ public class IdGenerator implements Serializable {
}
/**
 * Builds the fallback identifier list for an entity.
 *
 * <p>The returned list holds exactly one {@link Identifier}, created with an empty
 * {@link StructuredProperty}, the {@code PidType.original} type, the entity's
 * collectedfrom value, its entity type and its id. The identifier date is
 * {@code BASE_DATE} parsed with the supplied formatter.
 *
 * @param entity the entity the identifier is built for
 * @param sdf    the formatter used to parse {@code BASE_DATE}
 * @param <T>    the concrete entity type
 * @return a single-element list containing the fallback identifier
 */
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
    Date referenceDate;
    try {
        referenceDate = sdf.parse(BASE_DATE);
    } catch (ParseException unparsable) {
        // The formatter could not read BASE_DATE: use the current time instead.
        referenceDate = new Date();
    }

    final Identifier fallback = new Identifier(
        new StructuredProperty(),
        referenceDate,
        PidType.original,
        entity.getCollectedfrom(),
        EntityType.fromClass(entity.getClass()),
        entity.getId());

    return Lists.newArrayList(fallback);
}
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
if (entity.getPid() == null || entity.getPid().size() == 0)
return Lists
.newArrayList(
new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(),
EntityType.fromClass(entity.getClass()), entity.getId()));
return createBasePid(entity, sdf);
Optional<StructuredProperty> bp = entity
.getPid()
@ -64,14 +78,10 @@ public class IdGenerator implements Serializable {
.map(
structuredProperty -> Lists
.newArrayList(
new Identifier(structuredProperty, extractDate(entity, new SimpleDateFormat("yyyy-MM-dd")),
new Identifier(structuredProperty, extractDate(entity, sdf),
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
.orElseGet(
() -> Lists
.newArrayList(
new Identifier(new StructuredProperty(), new Date(), PidType.original,
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())));
.orElseGet(() -> createBasePid(entity, sdf));
}
@ -91,7 +101,7 @@ public class IdGenerator implements Serializable {
// 00-01-01
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
String date = "2000-01-01";
String date = BASE_DATE;
if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result result = (Result) duplicate;
if (isWellformed(result.getDateofacceptance())) {

View File

@ -118,7 +118,7 @@ public class Identifier implements Serializable, Comparable<Identifier> {
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
if (this.originalID.compareTo(i.originalID) > 0)
if (this.originalID.compareTo(i.originalID) < 0)
this.useOriginal = true;
else
i.setUseOriginal(true);

View File

@ -0,0 +1,140 @@
package eu.dnetlib.dhp.oa.dedup;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.dedup.model.PidType;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.codehaus.jackson.map.ObjectMapper;
import org.junit.jupiter.api.*;
import scala.Tuple2;
import static org.junit.jupiter.api.Assertions.assertEquals;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
/**
 * Tests for {@link IdGenerator}: selection of the best pid of a publication and
 * generation of the dedup identifier from a list of {@link Identifier}s.
 *
 * <p>Every test builds the input it needs from the fixtures prepared in
 * {@link #setUp()}, so the tests do not depend on each other or on execution order.
 */
public class IdGeneratorTest {

    // A single mapper is enough for all fixture lines; no need to build one per line.
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private static List<Tuple2<String, Publication>> pubs;
    private static List<Identifier> bestIds2;
    private static List<Identifier> bestIds3;
    private static String testEntityBasePath;
    private static SimpleDateFormat sdf;
    private static Date baseDate;

    /**
     * Prepares the shared fixtures: two hand-built identifier lists that all carry the
     * same date, and the publications read from {@code publication_idgeneration.json}.
     *
     * @throws Exception if the base date cannot be parsed or the fixture folder cannot be resolved
     */
    @BeforeAll
    public static void setUp() throws Exception {
        sdf = new SimpleDateFormat("yyyy-MM-dd");
        baseDate = sdf.parse("2000-01-01");

        // Three identifiers with the same date and the "original" pid type.
        bestIds2 = Lists.newArrayList(
            new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
                keyValue("key", "value"), EntityType.publication, "50|originalID1"),
            new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original,
                keyValue("key", "value"), EntityType.publication, "50|originalID2"),
            new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
                keyValue("key", "value"), EntityType.publication, "50|originalID3"));

        // Same as above, but the second identifier is a doi.
        bestIds3 = Lists.newArrayList(
            new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
                keyValue("key", "value"), EntityType.publication, "50|originalID1"),
            new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi,
                keyValue("key", "value"), EntityType.publication, "50|originalID2"),
            new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
                keyValue("key", "value"), EntityType.publication, "50|originalID3"));

        testEntityBasePath = Paths
            .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
            .toFile()
            .getAbsolutePath();

        pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
    }

    /**
     * Checks the pid type chosen by {@link IdGenerator#bestPidToIdentifier} for each
     * fixture publication, in file order.
     */
    @Test
    public void bestPidToIdentifierTest() {
        List<String> typesForAssertions = Lists
            .newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());

        // Walk by position: the expected type is tied to the line number of the fixture,
        // not to the first element that happens to be equal to the current one.
        for (int i = 0; i < pubs.size(); i++) {
            List<Identifier> ids = IdGenerator.bestPidToIdentifier(pubs.get(i)._2());

            assertEquals(typesForAssertions.get(i), ids.get(0).getPid().getQualifier().getClassid());
        }
    }

    /**
     * Generates the dedup id from the best pids of the fixture publications.
     * The identifiers are computed here rather than taken from another test.
     */
    @Test
    public void generateIdTest1() {
        List<Identifier> bestIds = new ArrayList<>();
        for (Tuple2<String, Publication> pub : pubs) {
            bestIds.addAll(IdGenerator.bestPidToIdentifier(pub._2()));
        }

        String id1 = IdGenerator.generate(bestIds, "50|defaultID");

        assertEquals("50|dedup_doi___::84f2cc49e3af11f20952eae15cdae066", id1);
    }

    /**
     * Generates the dedup id from the two hand-built identifier lists.
     */
    @Test
    public void generateIdTest2() {
        String id1 = IdGenerator.generate(bestIds2, "50|defaultID");
        String id2 = IdGenerator.generate(bestIds3, "50|defaultID");

        assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1);
        assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2);
    }

    /**
     * Reads a file with one JSON document per line.
     *
     * @param path  path of the file to read
     * @param clazz class each line is deserialized into
     * @param <T>   type of the deserialized documents
     * @return one tuple per line: the value at {@code $.id} and the deserialized document
     * @throws java.io.UncheckedIOException if the file cannot be read or a line cannot be parsed
     */
    public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
        List<Tuple2<String, T>> res = new ArrayList<>();

        // try-with-resources closes the reader even when a line fails to parse.
        try (BufferedReader reader = new BufferedReader(new FileReader(path))) {
            String line;
            while ((line = reader.readLine()) != null) {
                res
                    .add(
                        new Tuple2<>(
                            MapDocumentUtil.getJPathString("$.id", line),
                            OBJECT_MAPPER.readValue(line, clazz)));
            }
        } catch (IOException e) {
            // Fail loudly: an empty fixture list would let the tests pass without checking anything.
            throw new java.io.UncheckedIOException("Unable to read sample file " + path, e);
        }

        return res;
    }

    /**
     * Builds a pid whose qualifier uses {@code classid} for both class id and scheme id,
     * and {@code classname} for both class name and scheme name.
     *
     * @param pid       the pid value
     * @param classid   value for the qualifier class id and scheme id
     * @param classname value for the qualifier class name and scheme name
     * @return the populated structured property
     */
    public static StructuredProperty pid(String pid, String classid, String classname) {
        Qualifier q = new Qualifier();
        q.setSchemeid(classid);
        q.setSchemename(classname);
        q.setClassname(classname);
        q.setClassid(classid);

        StructuredProperty sp = new StructuredProperty();
        sp.setValue(pid);
        sp.setQualifier(q);
        return sp;
    }

    /**
     * Builds a single-element list holding a {@link KeyValue} with the given key and value.
     *
     * @param key   the key to set
     * @param value the value to set
     * @return a list containing the new key-value pair
     */
    public static List<KeyValue> keyValue(String key, String value) {
        KeyValue kv = new KeyValue();
        kv.setKey(key);
        kv.setValue(value);
        return Lists.newArrayList(kv);
    }
}