forked from antonis.lempesis/dnet-hadoop
Bug fix in the IdGenerator, plus implementation of its unit test
This commit is contained in:
parent
1804c5d809
commit
6f8720982c
|
@ -1,4 +1,3 @@
|
|||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.util.*;
|
||||
|
|
|
@ -24,6 +24,7 @@ public class IdGenerator implements Serializable {
|
|||
|
||||
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
||||
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
||||
public static String BASE_DATE = "2000-01-01";
|
||||
|
||||
// pick the best pid from the list (consider date and pidtype)
|
||||
public static String generate(List<Identifier> pids, String defaultID) {
|
||||
|
@ -45,14 +46,27 @@ public class IdGenerator implements Serializable {
|
|||
|
||||
}
|
||||
|
||||
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
|
||||
|
||||
Date date;
|
||||
try {
|
||||
date = sdf.parse(BASE_DATE);
|
||||
} catch (ParseException e) {
|
||||
date = new Date();
|
||||
}
|
||||
return Lists
|
||||
.newArrayList(
|
||||
new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
|
||||
EntityType.fromClass(entity.getClass()), entity.getId()));
|
||||
}
|
||||
|
||||
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
||||
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
|
||||
|
||||
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
||||
|
||||
if (entity.getPid() == null || entity.getPid().size() == 0)
|
||||
return Lists
|
||||
.newArrayList(
|
||||
new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(),
|
||||
EntityType.fromClass(entity.getClass()), entity.getId()));
|
||||
return createBasePid(entity, sdf);
|
||||
|
||||
Optional<StructuredProperty> bp = entity
|
||||
.getPid()
|
||||
|
@ -64,14 +78,10 @@ public class IdGenerator implements Serializable {
|
|||
.map(
|
||||
structuredProperty -> Lists
|
||||
.newArrayList(
|
||||
new Identifier(structuredProperty, extractDate(entity, new SimpleDateFormat("yyyy-MM-dd")),
|
||||
new Identifier(structuredProperty, extractDate(entity, sdf),
|
||||
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
|
||||
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
|
||||
.orElseGet(
|
||||
() -> Lists
|
||||
.newArrayList(
|
||||
new Identifier(new StructuredProperty(), new Date(), PidType.original,
|
||||
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())));
|
||||
.orElseGet(() -> createBasePid(entity, sdf));
|
||||
|
||||
}
|
||||
|
||||
|
@ -91,7 +101,7 @@ public class IdGenerator implements Serializable {
|
|||
// 00-01-01
|
||||
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
|
||||
|
||||
String date = "2000-01-01";
|
||||
String date = BASE_DATE;
|
||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||
Result result = (Result) duplicate;
|
||||
if (isWellformed(result.getDateofacceptance())) {
|
||||
|
|
|
@ -118,7 +118,7 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
|||
|
||||
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
||||
|
||||
if (this.originalID.compareTo(i.originalID) > 0)
|
||||
if (this.originalID.compareTo(i.originalID) < 0)
|
||||
this.useOriginal = true;
|
||||
else
|
||||
i.setUseOriginal(true);
|
||||
|
|
|
@ -0,0 +1,140 @@
|
|||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.PidType;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
import org.junit.jupiter.api.*;
|
||||
import scala.Tuple2;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Paths;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
public class IdGeneratorTest {
|
||||
|
||||
private static List<Identifier> bestIds;
|
||||
private static List<Tuple2<String, Publication>> pubs;
|
||||
|
||||
private static List<Identifier> bestIds2;
|
||||
private static List<Identifier> bestIds3;
|
||||
|
||||
private static String testEntityBasePath;
|
||||
|
||||
private static SimpleDateFormat sdf;
|
||||
private static Date baseDate;
|
||||
|
||||
@BeforeAll
|
||||
public static void setUp() throws Exception {
|
||||
|
||||
sdf = new SimpleDateFormat("yyyy-MM-dd");
|
||||
baseDate = sdf.parse("2000-01-01");
|
||||
|
||||
bestIds = new ArrayList<>();
|
||||
bestIds2 = Lists.newArrayList(
|
||||
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID1"),
|
||||
new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID2"),
|
||||
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID3")
|
||||
);
|
||||
bestIds3 = Lists.newArrayList(
|
||||
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID1"),
|
||||
new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi, keyValue("key", "value"), EntityType.publication, "50|originalID2"),
|
||||
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID3")
|
||||
);
|
||||
|
||||
testEntityBasePath = Paths
|
||||
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
||||
.toFile()
|
||||
.getAbsolutePath();
|
||||
|
||||
pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
|
||||
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(1)
|
||||
public void bestPidToIdentifierTest(){
|
||||
|
||||
List<String> typesForAssertions = Lists.newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());
|
||||
|
||||
for (Tuple2<String, Publication> pub : pubs) {
|
||||
List<Identifier> ids = IdGenerator.bestPidToIdentifier(pub._2());
|
||||
assertEquals(typesForAssertions.get(pubs.indexOf(pub)), ids.get(0).getPid().getQualifier().getClassid());
|
||||
bestIds.addAll(ids);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(2)
|
||||
public void generateIdTest1(){
|
||||
String id1 = IdGenerator.generate(bestIds, "50|defaultID");
|
||||
|
||||
assertEquals("50|dedup_doi___::84f2cc49e3af11f20952eae15cdae066", id1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void generateIdTest2(){
|
||||
String id1 = IdGenerator.generate(bestIds2, "50|defaultID");
|
||||
String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
|
||||
|
||||
assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1);
|
||||
assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2);
|
||||
}
|
||||
|
||||
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||
List<Tuple2<String, T>> res = new ArrayList<>();
|
||||
BufferedReader reader;
|
||||
try {
|
||||
reader = new BufferedReader(new FileReader(path));
|
||||
String line = reader.readLine();
|
||||
while (line != null) {
|
||||
res
|
||||
.add(
|
||||
new Tuple2<>(
|
||||
MapDocumentUtil.getJPathString("$.id", line),
|
||||
new ObjectMapper().readValue(line, clazz)));
|
||||
// read next line
|
||||
line = reader.readLine();
|
||||
}
|
||||
reader.close();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
public static StructuredProperty pid(String pid, String classid, String classname){
|
||||
|
||||
StructuredProperty sp = new StructuredProperty();
|
||||
sp.setValue(pid);
|
||||
Qualifier q = new Qualifier();
|
||||
q.setSchemeid(classid);
|
||||
q.setSchemename(classname);
|
||||
q.setClassname(classname);
|
||||
q.setClassid(classid);
|
||||
sp.setQualifier(q);
|
||||
return sp;
|
||||
}
|
||||
|
||||
public static List<KeyValue> keyValue(String key, String value){
|
||||
|
||||
KeyValue kv = new KeyValue();
|
||||
kv.setKey(key);
|
||||
kv.setValue(value);
|
||||
return Lists.newArrayList(kv);
|
||||
}
|
||||
}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue