Implementation of dedup_id generation using PIDs, to make the graph more stable

This commit is contained in:
miconis 2020-07-22 17:29:48 +02:00
parent 105176105c
commit b260fee787
7 changed files with 237 additions and 41 deletions

View File

@ -114,7 +114,7 @@ public class DatePicker {
}
}
private static boolean inRange(final String date) {
/**
 * Checks whether the year component of a {@code yyyy-...} date string
 * lies within the configured [YEAR_LB, YEAR_UB] bounds.
 */
public static boolean inRange(final String date) {
	final String yearPart = substringBefore(date, "-");
	final int year = Integer.parseInt(yearPart);
	return !(year < YEAR_LB || year > YEAR_UB);
}

View File

@ -1,11 +1,12 @@
package eu.dnetlib.dhp.oa.dedup;
import java.io.Serializable;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
@ -13,15 +14,12 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
public class DedupRecordFactory {
private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class);
@ -77,15 +75,23 @@ public class DedupRecordFactory {
String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
throws IllegalAccessException, InstantiationException {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
T entity = clazz.newInstance();
final Collection<String> dates = Lists.newArrayList();
final List<List<Author>> authors = Lists.newArrayList();
final List<Identifier> bestPids = Lists.newArrayList(); //best pids list
entities
.forEachRemaining(
t -> {
T duplicate = t._2();
StructuredProperty bestPid = bestPid(duplicate.getPid());
if (bestPid != null)
bestPids.add(new Identifier(bestPid, extractDate(duplicate, sdf), PidType.classidValueOf(bestPid.getQualifier().getClassid())));
entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result r1 = (Result) duplicate;
@ -94,6 +100,7 @@ public class DedupRecordFactory {
if (r1.getDateofacceptance() != null)
dates.add(r1.getDateofacceptance().getValue());
}
});
// set authors and date
@ -102,10 +109,73 @@ public class DedupRecordFactory {
((Result) entity).setAuthor(AuthorMerger.merge(authors));
}
entity.setId(id);
Identifier bestPid = winnerPid(bestPids);
if (bestPid == null)
entity.setId(id);
else
entity.setId(id.split("\\|")[0] + "|" + createPrefix(bestPid.getPid().getQualifier().getClassid()) + "::" + DedupUtility.md5(bestPid.getPid().getValue()));
entity.setLastupdatetimestamp(ts);
entity.setDataInfo(dataInfo);
return entity;
}
/**
 * Picks the best pid from the list, ignoring {@code undefined} types.
 * Priority follows {@link Identifier#compareTo}; on ties the earliest
 * element wins (matching Stream.max semantics).
 *
 * @param pids candidate identifiers, may be null or empty
 * @return the winning identifier, or null when no usable pid exists
 */
public static Identifier winnerPid(List<Identifier> pids) {
	if (pids == null || pids.isEmpty())
		return null;
	Identifier winner = null;
	for (Identifier candidate : pids) {
		if (candidate.getType() == PidType.undefined)
			continue;
		if (winner == null || candidate.compareTo(winner) > 0)
			winner = candidate;
	}
	return winner;
}
/**
 * Picks the best pid from an entity's pid list, ranked by {@link PidType}
 * ordinal (pids with an unrecognized classid are skipped). On ties the
 * earliest element wins, matching the original Stream.max behaviour.
 *
 * @param pids the entity's pids, may be null or empty
 * @return the highest-ranked pid, or null when none is usable
 */
public static StructuredProperty bestPid(List<StructuredProperty> pids) {
	if (pids == null || pids.isEmpty())
		return null;
	StructuredProperty best = null;
	for (StructuredProperty candidate : pids) {
		final PidType candidateType = PidType.classidValueOf(candidate.getQualifier().getClassid());
		if (candidateType == PidType.undefined)
			continue;
		if (best == null
			|| candidateType.compareTo(PidType.classidValueOf(best.getQualifier().getClassid())) > 0)
			best = candidate;
	}
	return best;
}
/**
 * Builds the 12-character dedup prefix: {@code "dedup_" + pidType},
 * right-padded with underscores and truncated to exactly 12 characters
 * (e.g. "doi" becomes "dedup_doi___").
 */
public static String createPrefix(String pidType) {
	// appending 12 underscores guarantees length >= 12 before truncation
	final String padded = "dedup_" + pidType + "____________";
	return padded.substring(0, 12);
}
/**
 * Extracts the date of acceptance from the record. When the date is missing
 * or not well-formed, the base date "2000-01-01" is used instead.
 *
 * @param duplicate the entity to read the date from (only Result subclasses carry one)
 * @param sdf       formatter for "yyyy-MM-dd" strings (not thread-safe; caller-owned)
 * @return the parsed date, or the current date if parsing unexpectedly fails
 */
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf){
	String dateValue = "2000-01-01"; // base date used as fallback
	if (ModelSupport.isSubClass(duplicate, Result.class)) {
		final Result result = (Result) duplicate;
		if (isWellformed(result.getDateofacceptance())) {
			dateValue = result.getDateofacceptance().getValue();
		}
	}
	try {
		return sdf.parse(dateValue);
	} catch (ParseException e) {
		// last-resort fallback; the well-formedness check should prevent this
		return new Date();
	}
}
/**
 * A date field is well-formed when it is non-blank, matches the
 * "yyyy-MM-dd" shape and its year passes {@link DatePicker#inRange}.
 */
public static boolean isWellformed(Field<String> date) {
	if (date == null || StringUtils.isBlank(date.getValue()))
		return false;
	final String value = date.getValue();
	return value.matches("\\d{4}-\\d{2}-\\d{2}") && DatePicker.inRange(value);
}
}

View File

@ -0,0 +1,55 @@
package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.io.Serializable;
import java.util.Date;
/**
 * A pid candidate for dedup id generation: the pid itself, the date of the
 * record carrying it, and its resolved {@link PidType}. Ordering prioritizes
 * the pid type and breaks ties on the date.
 */
public class Identifier implements Serializable, Comparable<Identifier>{

	StructuredProperty pid;
	Date date;
	PidType type;

	public Identifier(StructuredProperty pid, Date date, PidType type) {
		this.pid = pid;
		this.date = date;
		this.type = type;
	}

	public StructuredProperty getPid() {
		return pid;
	}

	public void setPid(StructuredProperty pidValue) {
		// fix: the original assigned the field to itself (this.pid = pid),
		// silently discarding the argument
		this.pid = pidValue;
	}

	public Date getDate() {
		return date;
	}

	public void setDate(Date date) {
		this.date = date;
	}

	public PidType getType() {
		return type;
	}

	public void setType(PidType type) {
		this.type = type;
	}

	@Override
	public int compareTo(Identifier i) {
		//priority in comparisons: 1) pidtype, 2) date
		final int byType = this.getType().compareTo(i.getType());
		if (byType != 0) {
			return byType;
		}
		// fix: the original compared this date against its own field
		// (this.getDate().compareTo(date)), which always returned 0 and
		// disabled date-based tie-breaking
		return this.getDate().compareTo(i.getDate());
	}
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.oa.dedup;
/**
 * Pid types recognized for dedup id generation. Declaration order goes from
 * the least to the most important type; comparisons rely on ordinal order.
 */
public enum PidType {

	undefined,
	orcid,
	ror,
	grid,
	pdb,
	arXiv,
	pmid,
	doi;

	/**
	 * Maps a classid string onto a PidType. Unknown (or null) classids map to
	 * {@link #undefined} instead of throwing.
	 */
	public static PidType classidValueOf(String s) {
		for (PidType type : values()) {
			if (type.name().equals(s)) {
				return type;
			}
		}
		return undefined;
	}
}
//dnet:pid_types
//"actrn"
//"nct"
//"euctr"
//"epo_id"
//"gsk"
//"GeoPass"
//"GBIF"
//"isrctn"
//"ISNI"
//"jprn"
//"mag_id"
//"oai"
//"orcid"
//"PANGAEA"
//"epo_nr_epodoc"
//"UNKNOWN"
//"VIAF"
//"arXiv"
//"doi"
//"grid"
//"info:eu-repo/dai"
//"orcidworkid"
//"pmc"
//"pmid"
//"urn"
//"who"
//"drks"
//"pdb"

View File

@ -22,10 +22,11 @@ public class EntityMergerTest implements Serializable {
List<Tuple2<String, Publication>> publications;
List<Tuple2<String, Publication>> publications2;
List<Tuple2<String, Publication>> publications3;
String testEntityBasePath;
DataInfo dataInfo;
String dedupId = "dedup_id";
String dedupId = "00|dedup_id::1";
Publication pub_top;
@BeforeEach
@ -38,6 +39,8 @@ public class EntityMergerTest implements Serializable {
publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
publications3 = readSample(testEntityBasePath + "/publication_merge3.json", Publication.class);
pub_top = getTopPub(publications);
@ -54,6 +57,9 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");
assertEquals(merged.getId(), "00|dedup_doi___::0968af610a356656706657e4f234b340");
}
@Test
@ -62,7 +68,8 @@ public class EntityMergerTest implements Serializable {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
assertEquals(dedupId, pub_merged.getId());
// verify id
assertEquals(pub_merged.getId(), "00|dedup_doi___::0968af610a356656706657e4f234b340");
assertEquals(pub_merged.getJournal(), pub_top.getJournal());
assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright());
@ -117,11 +124,25 @@ public class EntityMergerTest implements Serializable {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
// verify id
assertEquals(pub_merged.getId(), "00|dedup_doi___::0ca46ff10b2b4c756191719d85302b14");
assertEquals(pub_merged.getAuthor().size(), 27);
// insert assertions here
}
@Test
public void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
// verify id
assertEquals(pub_merged.getId(), "00|dedup_doi___::0ca46ff10b2b4c756191719d85302b14");
}
public DataInfo setDI() {
DataInfo dataInfo = new DataInfo();
dataInfo.setTrust("0.9");

View File

@ -1,22 +1,12 @@
package eu.dnetlib.dhp.oa.dedup;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.col;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.lenient;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Collectors;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
@ -35,16 +25,19 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.lenient;
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkDedupTest implements Serializable {

File diff suppressed because one or more lines are too long