forked from D-Net/dnet-hadoop
bug fix in the idgenerator and test implementation
This commit is contained in:
parent
1804c5d809
commit
6f8720982c
|
@ -1,4 +1,3 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
|
@ -24,6 +24,7 @@ public class IdGenerator implements Serializable {
|
||||||
|
|
||||||
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
||||||
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
||||||
|
public static String BASE_DATE = "2000-01-01";
|
||||||
|
|
||||||
// pick the best pid from the list (consider date and pidtype)
|
// pick the best pid from the list (consider date and pidtype)
|
||||||
public static String generate(List<Identifier> pids, String defaultID) {
|
public static String generate(List<Identifier> pids, String defaultID) {
|
||||||
|
@ -45,14 +46,27 @@ public class IdGenerator implements Serializable {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
|
||||||
|
|
||||||
|
Date date;
|
||||||
|
try {
|
||||||
|
date = sdf.parse(BASE_DATE);
|
||||||
|
} catch (ParseException e) {
|
||||||
|
date = new Date();
|
||||||
|
}
|
||||||
|
return Lists
|
||||||
|
.newArrayList(
|
||||||
|
new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
|
||||||
|
EntityType.fromClass(entity.getClass()), entity.getId()));
|
||||||
|
}
|
||||||
|
|
||||||
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
||||||
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
|
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
|
||||||
|
|
||||||
|
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
||||||
|
|
||||||
if (entity.getPid() == null || entity.getPid().size() == 0)
|
if (entity.getPid() == null || entity.getPid().size() == 0)
|
||||||
return Lists
|
return createBasePid(entity, sdf);
|
||||||
.newArrayList(
|
|
||||||
new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(),
|
|
||||||
EntityType.fromClass(entity.getClass()), entity.getId()));
|
|
||||||
|
|
||||||
Optional<StructuredProperty> bp = entity
|
Optional<StructuredProperty> bp = entity
|
||||||
.getPid()
|
.getPid()
|
||||||
|
@ -64,14 +78,10 @@ public class IdGenerator implements Serializable {
|
||||||
.map(
|
.map(
|
||||||
structuredProperty -> Lists
|
structuredProperty -> Lists
|
||||||
.newArrayList(
|
.newArrayList(
|
||||||
new Identifier(structuredProperty, extractDate(entity, new SimpleDateFormat("yyyy-MM-dd")),
|
new Identifier(structuredProperty, extractDate(entity, sdf),
|
||||||
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
|
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
|
||||||
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
|
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
|
||||||
.orElseGet(
|
.orElseGet(() -> createBasePid(entity, sdf));
|
||||||
() -> Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(new StructuredProperty(), new Date(), PidType.original,
|
|
||||||
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())));
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,7 +101,7 @@ public class IdGenerator implements Serializable {
|
||||||
// 00-01-01
|
// 00-01-01
|
||||||
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
|
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
|
||||||
|
|
||||||
String date = "2000-01-01";
|
String date = BASE_DATE;
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
Result result = (Result) duplicate;
|
Result result = (Result) duplicate;
|
||||||
if (isWellformed(result.getDateofacceptance())) {
|
if (isWellformed(result.getDateofacceptance())) {
|
||||||
|
|
|
@ -118,7 +118,7 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
||||||
|
|
||||||
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
||||||
|
|
||||||
if (this.originalID.compareTo(i.originalID) > 0)
|
if (this.originalID.compareTo(i.originalID) < 0)
|
||||||
this.useOriginal = true;
|
this.useOriginal = true;
|
||||||
else
|
else
|
||||||
i.setUseOriginal(true);
|
i.setUseOriginal(true);
|
||||||
|
|
|
@ -0,0 +1,140 @@
|
||||||
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||||
|
import eu.dnetlib.dhp.oa.dedup.model.PidType;
|
||||||
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
|
import org.junit.jupiter.api.*;
|
||||||
|
import scala.Tuple2;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
|
||||||
|
import java.io.FileReader;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Date;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
|
public class IdGeneratorTest {
|
||||||
|
|
||||||
|
private static List<Identifier> bestIds;
|
||||||
|
private static List<Tuple2<String, Publication>> pubs;
|
||||||
|
|
||||||
|
private static List<Identifier> bestIds2;
|
||||||
|
private static List<Identifier> bestIds3;
|
||||||
|
|
||||||
|
private static String testEntityBasePath;
|
||||||
|
|
||||||
|
private static SimpleDateFormat sdf;
|
||||||
|
private static Date baseDate;
|
||||||
|
|
||||||
|
@BeforeAll
|
||||||
|
public static void setUp() throws Exception {
|
||||||
|
|
||||||
|
sdf = new SimpleDateFormat("yyyy-MM-dd");
|
||||||
|
baseDate = sdf.parse("2000-01-01");
|
||||||
|
|
||||||
|
bestIds = new ArrayList<>();
|
||||||
|
bestIds2 = Lists.newArrayList(
|
||||||
|
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID1"),
|
||||||
|
new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID2"),
|
||||||
|
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID3")
|
||||||
|
);
|
||||||
|
bestIds3 = Lists.newArrayList(
|
||||||
|
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID1"),
|
||||||
|
new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi, keyValue("key", "value"), EntityType.publication, "50|originalID2"),
|
||||||
|
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original, keyValue("key", "value"), EntityType.publication, "50|originalID3")
|
||||||
|
);
|
||||||
|
|
||||||
|
testEntityBasePath = Paths
|
||||||
|
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
||||||
|
.toFile()
|
||||||
|
.getAbsolutePath();
|
||||||
|
|
||||||
|
pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Order(1)
|
||||||
|
public void bestPidToIdentifierTest(){
|
||||||
|
|
||||||
|
List<String> typesForAssertions = Lists.newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());
|
||||||
|
|
||||||
|
for (Tuple2<String, Publication> pub : pubs) {
|
||||||
|
List<Identifier> ids = IdGenerator.bestPidToIdentifier(pub._2());
|
||||||
|
assertEquals(typesForAssertions.get(pubs.indexOf(pub)), ids.get(0).getPid().getQualifier().getClassid());
|
||||||
|
bestIds.addAll(ids);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
@Order(2)
|
||||||
|
public void generateIdTest1(){
|
||||||
|
String id1 = IdGenerator.generate(bestIds, "50|defaultID");
|
||||||
|
|
||||||
|
assertEquals("50|dedup_doi___::84f2cc49e3af11f20952eae15cdae066", id1);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void generateIdTest2(){
|
||||||
|
String id1 = IdGenerator.generate(bestIds2, "50|defaultID");
|
||||||
|
String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
|
||||||
|
|
||||||
|
assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1);
|
||||||
|
assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||||
|
List<Tuple2<String, T>> res = new ArrayList<>();
|
||||||
|
BufferedReader reader;
|
||||||
|
try {
|
||||||
|
reader = new BufferedReader(new FileReader(path));
|
||||||
|
String line = reader.readLine();
|
||||||
|
while (line != null) {
|
||||||
|
res
|
||||||
|
.add(
|
||||||
|
new Tuple2<>(
|
||||||
|
MapDocumentUtil.getJPathString("$.id", line),
|
||||||
|
new ObjectMapper().readValue(line, clazz)));
|
||||||
|
// read next line
|
||||||
|
line = reader.readLine();
|
||||||
|
}
|
||||||
|
reader.close();
|
||||||
|
} catch (IOException e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static StructuredProperty pid(String pid, String classid, String classname){
|
||||||
|
|
||||||
|
StructuredProperty sp = new StructuredProperty();
|
||||||
|
sp.setValue(pid);
|
||||||
|
Qualifier q = new Qualifier();
|
||||||
|
q.setSchemeid(classid);
|
||||||
|
q.setSchemename(classname);
|
||||||
|
q.setClassname(classname);
|
||||||
|
q.setClassid(classid);
|
||||||
|
sp.setQualifier(q);
|
||||||
|
return sp;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static List<KeyValue> keyValue(String key, String value){
|
||||||
|
|
||||||
|
KeyValue kv = new KeyValue();
|
||||||
|
kv.setKey(key);
|
||||||
|
kv.setValue(value);
|
||||||
|
return Lists.newArrayList(kv);
|
||||||
|
}
|
||||||
|
}
|
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue