forked from D-Net/dnet-hadoop

commit b260fee787 (parent 105176105c)

Implementation of dedup_id generation using PIDs to make the graph more stable
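In short, the change set below makes the dedup record id depend on the duplicates' persistent identifiers: when the merged group contains at least one PID of a known type, the id is rebuilt from the most important PID instead of keeping the raw cluster id. A minimal, self-contained sketch of the new id rule (the cluster id and pid value are hypothetical, and DedupUtility.md5 is assumed to be a plain lowercase MD5 hex digest):

    import java.math.BigInteger;
    import java.nio.charset.StandardCharsets;
    import java.security.MessageDigest;

    // Sketch only: re-implements createPrefix locally and approximates DedupUtility.md5.
    public class DedupIdSketch {

        // pad/truncate "dedup_" + pidType to exactly 12 characters, as in the diff below
        static String createPrefix(String pidType) {
            StringBuilder prefix = new StringBuilder("dedup_" + pidType);
            while (prefix.length() < 12) {
                prefix.append("_");
            }
            return prefix.substring(0, 12);
        }

        // assumed equivalent of DedupUtility.md5: lowercase hex MD5 of the pid value
        static String md5(String s) throws Exception {
            byte[] digest = MessageDigest.getInstance("MD5").digest(s.getBytes(StandardCharsets.UTF_8));
            return String.format("%032x", new BigInteger(1, digest));
        }

        public static void main(String[] args) throws Exception {
            String clusterId = "00|dedup_wf_001::abc";   // hypothetical original dedup id
            String classid = "doi";                      // classid of the winning pid
            String pidValue = "10.1234/example";         // hypothetical pid value

            String newId = clusterId.split("\\|")[0]
                + "|" + createPrefix(classid)
                + "::" + md5(pidValue);

            System.out.println(newId);   // 00|dedup_doi___::<32-char md5 of the pid value>
        }
    }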
@@ -114,7 +114,7 @@ public class DatePicker {
         }
     }
 
-    private static boolean inRange(final String date) {
+    public static boolean inRange(final String date) {
         final int year = Integer.parseInt(substringBefore(date, "-"));
         return year >= YEAR_LB && year <= YEAR_UB;
     }
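The widened visibility of inRange appears to be needed because the new DedupRecordFactory.isWellformed helper, added further down in this diff, calls DatePicker.inRange from outside the class when validating a candidate date of acceptance.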
@@ -1,11 +1,12 @@
 
 package eu.dnetlib.dhp.oa.dedup;
 
-import java.io.Serializable;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
+import com.fasterxml.jackson.databind.DeserializationFeature;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.google.common.collect.Lists;
+import eu.dnetlib.dhp.schema.common.ModelSupport;
+import eu.dnetlib.dhp.schema.oaf.*;
+import org.apache.commons.lang.StringUtils;
 import org.apache.spark.api.java.function.MapFunction;
 import org.apache.spark.api.java.function.MapGroupsFunction;
 import org.apache.spark.sql.Dataset;
@@ -13,15 +14,12 @@ import org.apache.spark.sql.Encoders;
 import org.apache.spark.sql.SparkSession;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import com.fasterxml.jackson.databind.DeserializationFeature;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.collect.Lists;
-
-import eu.dnetlib.dhp.schema.common.ModelSupport;
-import eu.dnetlib.dhp.schema.oaf.*;
 import scala.Tuple2;
 
+import java.text.ParseException;
+import java.text.SimpleDateFormat;
+import java.util.*;
+
 public class DedupRecordFactory {
 
     private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class);
@@ -77,15 +75,23 @@ public class DedupRecordFactory {
         String id, Iterator<Tuple2<String, T>> entities, long ts, DataInfo dataInfo, Class<T> clazz)
         throws IllegalAccessException, InstantiationException {
 
+        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
+
         T entity = clazz.newInstance();
 
         final Collection<String> dates = Lists.newArrayList();
         final List<List<Author>> authors = Lists.newArrayList();
+        final List<Identifier> bestPids = Lists.newArrayList(); // best pids list
 
         entities
             .forEachRemaining(
                 t -> {
                     T duplicate = t._2();
 
+                    StructuredProperty bestPid = bestPid(duplicate.getPid());
+                    if (bestPid != null)
+                        bestPids.add(new Identifier(bestPid, extractDate(duplicate, sdf), PidType.classidValueOf(bestPid.getQualifier().getClassid())));
+
                     entity.mergeFrom(duplicate);
                     if (ModelSupport.isSubClass(duplicate, Result.class)) {
                         Result r1 = (Result) duplicate;
@@ -94,6 +100,7 @@ public class DedupRecordFactory {
                         if (r1.getDateofacceptance() != null)
                             dates.add(r1.getDateofacceptance().getValue());
                     }
 
                 });
 
         // set authors and date
@@ -102,10 +109,73 @@ public class DedupRecordFactory {
             ((Result) entity).setAuthor(AuthorMerger.merge(authors));
         }
 
-        entity.setId(id);
+        Identifier bestPid = winnerPid(bestPids);
+        if (bestPid == null)
+            entity.setId(id);
+        else
+            entity.setId(id.split("\\|")[0] + "|" + createPrefix(bestPid.getPid().getQualifier().getClassid()) + "::" + DedupUtility.md5(bestPid.getPid().getValue()));
 
         entity.setLastupdatetimestamp(ts);
         entity.setDataInfo(dataInfo);
 
         return entity;
     }
 
+    // pick the best pid from the list (consider date and pidtype)
+    public static Identifier winnerPid(List<Identifier> pids) {
+        if (pids == null || pids.size() == 0)
+            return null;
+        Optional<Identifier> bp = pids.stream()
+            .filter(pid -> pid.getType() != PidType.undefined)
+            .max(Identifier::compareTo);
+        return bp.orElse(null);
+    }
+
+    // pick the best pid from the entity
+    public static StructuredProperty bestPid(List<StructuredProperty> pids) {
+
+        if (pids == null || pids.size() == 0)
+            return null;
+        Optional<StructuredProperty> bp = pids.stream()
+            .filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
+            .max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
+
+        return bp.orElse(null);
+    }
+
+    // create the prefix (length = 12): "dedup_" + pidType, padded with underscores
+    public static String createPrefix(String pidType) {
+
+        StringBuilder prefix = new StringBuilder("dedup_" + pidType);
+
+        while (prefix.length() < 12) {
+            prefix.append("_");
+        }
+        return prefix.toString().substring(0, 12);
+
+    }
+
+    // extracts the date from the record; if the date is not available or not well formed, it falls back to the base date 2000-01-01
+    public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
+
+        String date = "2000-01-01";
+        if (ModelSupport.isSubClass(duplicate, Result.class)) {
+            Result result = (Result) duplicate;
+            if (isWellformed(result.getDateofacceptance())) {
+                date = result.getDateofacceptance().getValue();
+            }
+        }
+
+        try {
+            return sdf.parse(date);
+        } catch (ParseException e) {
+            return new Date();
+        }
+
+    }
+
+    public static boolean isWellformed(Field<String> date) {
+        return date != null && StringUtils.isNotBlank(date.getValue()) && date.getValue().matches("\\d{4}-\\d{2}-\\d{2}") && DatePicker.inRange(date.getValue());
+    }
+
 }
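A compact, self-contained sketch of the bestPid selection above: drop classids that are not declared in the PidType enum, then keep the one that ranks highest in the enum's declaration order. PidType is re-declared locally so the snippet compiles on its own, and the sample classids are hypothetical:

    import java.util.Arrays;
    import java.util.Comparator;
    import java.util.List;
    import java.util.Optional;

    public class BestPidSketch {

        enum PidType {
            undefined, orcid, ror, grid, pdb, arXiv, pmid, doi;

            static PidType classidValueOf(String s) {
                try {
                    return PidType.valueOf(s);
                } catch (Exception e) {
                    return PidType.undefined;
                }
            }
        }

        public static void main(String[] args) {
            List<String> classids = Arrays.asList("pmid", "doi", "handle", "orcid");   // hypothetical input

            Optional<String> best = classids.stream()
                .filter(c -> PidType.classidValueOf(c) != PidType.undefined)   // "handle" is discarded
                .max(Comparator.comparing(PidType::classidValueOf));           // enum order = importance

            System.out.println(best.orElse(null));   // doi
        }
    }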
New file: Identifier.java
@@ -0,0 +1,55 @@
+package eu.dnetlib.dhp.oa.dedup;
+
+import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
+
+import java.io.Serializable;
+import java.util.Date;
+
+public class Identifier implements Serializable, Comparable<Identifier> {
+
+    StructuredProperty pid;
+    Date date;
+    PidType type;
+
+    public Identifier(StructuredProperty pid, Date date, PidType type) {
+        this.pid = pid;
+        this.date = date;
+        this.type = type;
+    }
+
+    public StructuredProperty getPid() {
+        return pid;
+    }
+
+    public void setPid(StructuredProperty pidValue) {
+        this.pid = pidValue;
+    }
+
+    public Date getDate() {
+        return date;
+    }
+
+    public void setDate(Date date) {
+        this.date = date;
+    }
+
+    public PidType getType() {
+        return type;
+    }
+
+    public void setType(PidType type) {
+        this.type = type;
+    }
+
+    @Override
+    public int compareTo(Identifier i) {
+        // priority in comparisons: 1) pidtype, 2) date
+        if (this.getType().compareTo(i.getType()) == 0) { // same type
+            return this.getDate().compareTo(i.getDate());
+        } else {
+            return this.getType().compareTo(i.getType());
+        }
+    }
+}
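A simplified, self-contained sketch of the ordering that the compareTo above expresses, as used by winnerPid: pid type first (enum declaration order), acceptance date as the tie-breaker when the types match. A plain String stands in for the StructuredProperty value, and the dates and pid values are made up:

    import java.util.Arrays;
    import java.util.Date;
    import java.util.GregorianCalendar;
    import java.util.List;

    public class WinnerPidSketch {

        enum PidType { undefined, orcid, ror, grid, pdb, arXiv, pmid, doi }

        // simplified stand-in for eu.dnetlib.dhp.oa.dedup.Identifier
        static class Identifier implements Comparable<Identifier> {
            final String value;
            final Date date;
            final PidType type;

            Identifier(String value, Date date, PidType type) {
                this.value = value;
                this.date = date;
                this.type = type;
            }

            @Override
            public int compareTo(Identifier i) {
                // priority in comparisons: 1) pidtype, 2) date
                return this.type != i.type
                    ? this.type.compareTo(i.type)
                    : this.date.compareTo(i.date);
            }
        }

        public static void main(String[] args) {
            List<Identifier> pids = Arrays.asList(
                new Identifier("0000-0002-1825-0097", new GregorianCalendar(2019, 0, 1).getTime(), PidType.orcid),
                new Identifier("10.1234/old", new GregorianCalendar(2010, 0, 1).getTime(), PidType.doi),
                new Identifier("10.1234/new", new GregorianCalendar(2018, 0, 1).getTime(), PidType.doi));

            Identifier winner = pids.stream()
                .filter(p -> p.type != PidType.undefined)
                .max(Identifier::compareTo)
                .orElse(null);

            // doi outranks orcid; between the two dois, the later acceptance date wins
            System.out.println(winner.value);   // 10.1234/new
        }
    }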
New file: PidType.java
@@ -0,0 +1,54 @@
+package eu.dnetlib.dhp.oa.dedup;
+
+public enum PidType {
+
+    // ordered from least to most important
+    undefined,
+    orcid,
+    ror,
+    grid,
+    pdb,
+    arXiv,
+    pmid,
+    doi;
+
+    public static PidType classidValueOf(String s) {
+        try {
+            return PidType.valueOf(s);
+        } catch (Exception e) {
+            return PidType.undefined;
+        }
+    }
+
+}
+
+// dnet:pid_types
+// "actrn"
+// "nct"
+// "euctr"
+// "epo_id"
+// "gsk"
+// "GeoPass"
+// "GBIF"
+// "isrctn"
+// "ISNI"
+// "jprn"
+// "mag_id"
+// "oai"
+// "orcid"
+// "PANGAEA"
+// "epo_nr_epodoc"
+// "UNKNOWN"
+// "VIAF"
+// "arXiv"
+// "doi"
+// "grid"
+// "info:eu-repo/dai"
+// "orcidworkid"
+// "pmc"
+// "pmid"
+// "urn"
+// "who"
+// "drks"
+// "pdb"
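As a usage note, any classid not declared in the enum, including most of the dnet:pid_types values listed in the comment above (e.g. "pmc" or "urn"), maps to undefined and is therefore ignored by bestPid and winnerPid. A tiny sketch, assumed to sit in the same eu.dnetlib.dhp.oa.dedup package so it can see PidType:

    package eu.dnetlib.dhp.oa.dedup;

    public class PidTypeFallbackSketch {
        public static void main(String[] args) {
            System.out.println(PidType.classidValueOf("doi"));               // doi
            System.out.println(PidType.classidValueOf("pmc"));               // undefined: not in the enum
            System.out.println(PidType.doi.compareTo(PidType.orcid) > 0);    // true: doi ranks higher
        }
    }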
@@ -22,10 +22,11 @@ public class EntityMergerTest implements Serializable {
 
     List<Tuple2<String, Publication>> publications;
     List<Tuple2<String, Publication>> publications2;
+    List<Tuple2<String, Publication>> publications3;
 
     String testEntityBasePath;
     DataInfo dataInfo;
-    String dedupId = "dedup_id";
+    String dedupId = "00|dedup_id::1";
     Publication pub_top;
 
     @BeforeEach
@@ -38,6 +39,8 @@ public class EntityMergerTest implements Serializable {
 
         publications = readSample(testEntityBasePath + "/publication_merge.json", Publication.class);
         publications2 = readSample(testEntityBasePath + "/publication_merge2.json", Publication.class);
+        publications3 = readSample(testEntityBasePath + "/publication_merge3.json", Publication.class);
 
         pub_top = getTopPub(publications);
 
@@ -54,6 +57,9 @@ public class EntityMergerTest implements Serializable {
             .entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
 
         assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");
 
+        assertEquals(merged.getId(), "00|dedup_doi___::0968af610a356656706657e4f234b340");
+
     }
 
     @Test
@@ -62,7 +68,8 @@ public class EntityMergerTest implements Serializable {
         Publication pub_merged = DedupRecordFactory
             .entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
 
-        assertEquals(dedupId, pub_merged.getId());
+        // verify id
+        assertEquals(pub_merged.getId(), "00|dedup_doi___::0968af610a356656706657e4f234b340");
 
         assertEquals(pub_merged.getJournal(), pub_top.getJournal());
         assertEquals(pub_merged.getBestaccessright(), pub_top.getBestaccessright());
@@ -117,11 +124,25 @@ public class EntityMergerTest implements Serializable {
         Publication pub_merged = DedupRecordFactory
             .entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
 
+        // verify id
+        assertEquals(pub_merged.getId(), "00|dedup_doi___::0ca46ff10b2b4c756191719d85302b14");
+
         assertEquals(pub_merged.getAuthor().size(), 27);
-        // insert assertions here
 
     }
 
+    @Test
+    public void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
+
+        Publication pub_merged = DedupRecordFactory
+            .entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
+
+        // verify id
+        assertEquals(pub_merged.getId(), "00|dedup_doi___::0ca46ff10b2b4c756191719d85302b14");
+
+    }
+
     public DataInfo setDI() {
         DataInfo dataInfo = new DataInfo();
         dataInfo.setTrust("0.9");
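The ids asserted in these tests follow the pattern introduced above: a two-character entity prefix, a 12-character "dedup_<pidType>" segment padded with underscores, and the 32-character MD5 of the best pid value. A tiny standalone check of that shape (not part of the test suite):

    public class DedupIdFormatSketch {
        public static void main(String[] args) {
            String id = "00|dedup_doi___::0968af610a356656706657e4f234b340";
            System.out.println(id.matches("\\d{2}\\|dedup_\\w{6}::[0-9a-f]{32}"));   // true
        }
    }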
@@ -1,22 +1,12 @@
 
 package eu.dnetlib.dhp.oa.dedup;
 
-import static java.nio.file.Files.createTempDirectory;
-
-import static org.apache.spark.sql.functions.col;
-import static org.apache.spark.sql.functions.count;
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.mockito.Mockito.lenient;
-
-import java.io.File;
-import java.io.IOException;
-import java.io.Serializable;
-import java.net.URISyntaxException;
-import java.nio.file.Paths;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.stream.Collectors;
-
+import com.fasterxml.jackson.databind.ObjectMapper;
+import eu.dnetlib.dhp.application.ArgumentApplicationParser;
+import eu.dnetlib.dhp.schema.oaf.Relation;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
+import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
+import eu.dnetlib.pace.util.MapDocumentUtil;
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.SparkConf;
@@ -35,16 +25,19 @@ import org.junit.jupiter.api.extension.ExtendWith;
 import org.mockito.Mock;
 import org.mockito.Mockito;
 import org.mockito.junit.jupiter.MockitoExtension;
 
-import com.fasterxml.jackson.databind.ObjectMapper;
-
-import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.Relation;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
-import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
-import eu.dnetlib.pace.util.MapDocumentUtil;
 import scala.Tuple2;
 
+import java.io.File;
+import java.io.IOException;
+import java.io.Serializable;
+import java.net.URISyntaxException;
+import java.nio.file.Paths;
+
+import static java.nio.file.Files.createTempDirectory;
+import static org.apache.spark.sql.functions.count;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.mockito.Mockito.lenient;
+
 @ExtendWith(MockitoExtension.class)
 @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
 public class SparkDedupTest implements Serializable {
File diff suppressed because one or more lines are too long