forked from D-Net/dnet-hadoop
implemented DedupRecord factory with the merge of project
This commit is contained in:
parent
6b45e37e22
commit
39367676d7
|
@ -19,7 +19,7 @@ public class Dataset extends Result implements Serializable {
|
||||||
|
|
||||||
private List<GeoLocation> geolocation;
|
private List<GeoLocation> geolocation;
|
||||||
|
|
||||||
public Field<String> getStoragedate() {
|
public Field<String> getStoragedate() {
|
||||||
return storagedate;
|
return storagedate;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,4 +74,26 @@ public class Dataset extends Result implements Serializable {
|
||||||
public void setGeolocation(List<GeoLocation> geolocation) {
|
public void setGeolocation(List<GeoLocation> geolocation) {
|
||||||
this.geolocation = geolocation;
|
this.geolocation = geolocation;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void mergeFrom(OafEntity e) {
|
||||||
|
super.mergeFrom(e);
|
||||||
|
final Dataset d = (Dataset) e;
|
||||||
|
|
||||||
|
storagedate = d.getStoragedate() != null && compareTrust(this, e)<0? d.getStoragedate() : storagedate;
|
||||||
|
|
||||||
|
device= d.getDevice() != null && compareTrust(this, e)<0? d.getDevice() : device;
|
||||||
|
|
||||||
|
size= d.getSize() != null && compareTrust(this, e)<0? d.getSize() : size;
|
||||||
|
|
||||||
|
version= d.getVersion() != null && compareTrust(this, e)<0? d.getVersion() : version;
|
||||||
|
|
||||||
|
lastmetadataupdate= d.getLastmetadataupdate() != null && compareTrust(this, e)<0? d.getLastmetadataupdate() :lastmetadataupdate;
|
||||||
|
|
||||||
|
metadataversionnumber= d.getMetadataversionnumber() != null && compareTrust(this, e)<0? d.getMetadataversionnumber() : metadataversionnumber;
|
||||||
|
|
||||||
|
geolocation = mergeLists(geolocation, d.getGeolocation());
|
||||||
|
|
||||||
|
mergeOAFDataInfo(d);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,4 +33,29 @@ public class GeoLocation implements Serializable {
|
||||||
public void setPlace(String place) {
|
public void setPlace(String place) {
|
||||||
this.place = place;
|
this.place = place;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public String toComparableString() {
|
||||||
|
return String.format("%s::%s%s", point != null ? point.toLowerCase() : "", box != null ? box.toLowerCase() : "",place != null ? place.toLowerCase() : "");
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int hashCode() {
|
||||||
|
return toComparableString().hashCode();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public boolean equals(Object obj) {
|
||||||
|
if (this == obj)
|
||||||
|
return true;
|
||||||
|
if (obj == null)
|
||||||
|
return false;
|
||||||
|
if (getClass() != obj.getClass())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
GeoLocation other = (GeoLocation) obj;
|
||||||
|
|
||||||
|
return toComparableString()
|
||||||
|
.equals(other.toComparableString());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,4 +23,24 @@ public abstract class Oaf implements Serializable {
|
||||||
public void setLastupdatetimestamp(Long lastupdatetimestamp) {
|
public void setLastupdatetimestamp(Long lastupdatetimestamp) {
|
||||||
this.lastupdatetimestamp = lastupdatetimestamp;
|
this.lastupdatetimestamp = lastupdatetimestamp;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public void mergeOAFDataInfo(Oaf e) {
|
||||||
|
if (e.getDataInfo()!= null && compareTrust(this,e)<0)
|
||||||
|
dataInfo = e.getDataInfo();
|
||||||
|
}
|
||||||
|
|
||||||
|
protected String extractTrust(Oaf e) {
|
||||||
|
if (e == null || e.getDataInfo()== null || e.getDataInfo().getTrust()== null)
|
||||||
|
return "0.0";
|
||||||
|
return e.getDataInfo().getTrust();
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
protected int compareTrust(Oaf a, Oaf b) {
|
||||||
|
return extractTrust(a).compareTo(extractTrust(b));
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -86,6 +86,7 @@ public abstract class OafEntity extends Oaf implements Serializable {
|
||||||
this.oaiprovenance = oaiprovenance;
|
this.oaiprovenance = oaiprovenance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public void mergeFrom(OafEntity e) {
|
public void mergeFrom(OafEntity e) {
|
||||||
|
|
||||||
if (e == null)
|
if (e == null)
|
||||||
|
@ -97,13 +98,16 @@ public abstract class OafEntity extends Oaf implements Serializable {
|
||||||
|
|
||||||
pid = mergeLists(pid, e.getPid());
|
pid = mergeLists(pid, e.getPid());
|
||||||
|
|
||||||
dateofcollection = e.getDateofcollection();
|
if (e.getDateofcollection() != null && compareTrust(this, e) < 0)
|
||||||
|
dateofcollection = e.getDateofcollection();
|
||||||
|
|
||||||
dateoftransformation = e.getDateoftransformation();
|
if (e.getDateoftransformation() != null && compareTrust(this, e) < 0)
|
||||||
|
dateoftransformation = e.getDateoftransformation();
|
||||||
|
|
||||||
extraInfo = mergeLists(extraInfo, e.getExtraInfo());
|
extraInfo = mergeLists(extraInfo, e.getExtraInfo());
|
||||||
|
|
||||||
oaiprovenance = e.getOaiprovenance();
|
if (e.getOaiprovenance() != null && compareTrust(this, e) < 0)
|
||||||
|
oaiprovenance = e.getOaiprovenance();
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -170,21 +170,22 @@ public class Organization extends OafEntity implements Serializable {
|
||||||
public void mergeFrom(OafEntity e) {
|
public void mergeFrom(OafEntity e) {
|
||||||
super.mergeFrom(e);
|
super.mergeFrom(e);
|
||||||
final Organization o = (Organization) e;
|
final Organization o = (Organization) e;
|
||||||
legalshortname = o.getLegalshortname() != null ? o.getLegalshortname() : legalshortname;
|
legalshortname = o.getLegalshortname() != null && compareTrust(this, e)<0? o.getLegalshortname() : legalshortname;
|
||||||
legalname = o.getLegalname() != null ? o.getLegalname() : legalname;
|
legalname = o.getLegalname() != null && compareTrust(this, e)<0 ? o.getLegalname() : legalname;
|
||||||
alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames);
|
alternativeNames = mergeLists(o.getAlternativeNames(), alternativeNames);
|
||||||
websiteurl = o.getWebsiteurl() != null ? o.getWebsiteurl() : websiteurl;
|
websiteurl = o.getWebsiteurl() != null && compareTrust(this, e)<0? o.getWebsiteurl() : websiteurl;
|
||||||
logourl = o.getLogourl() != null ? o.getLogourl() : logourl;
|
logourl = o.getLogourl() != null && compareTrust(this, e)<0? o.getLogourl() : logourl;
|
||||||
eclegalbody = o.getEclegalbody() != null ? o.getEclegalbody() : eclegalbody;
|
eclegalbody = o.getEclegalbody() != null && compareTrust(this, e)<0? o.getEclegalbody() : eclegalbody;
|
||||||
eclegalperson = o.getEclegalperson() != null ? o.getEclegalperson() : eclegalperson;
|
eclegalperson = o.getEclegalperson() != null && compareTrust(this, e)<0? o.getEclegalperson() : eclegalperson;
|
||||||
ecnonprofit = o.getEcnonprofit() != null ? o.getEcnonprofit() : ecnonprofit;
|
ecnonprofit = o.getEcnonprofit() != null && compareTrust(this, e)<0? o.getEcnonprofit() : ecnonprofit;
|
||||||
ecresearchorganization = o.getEcresearchorganization() != null ? o.getEcresearchorganization() : ecresearchorganization;
|
ecresearchorganization = o.getEcresearchorganization() != null && compareTrust(this, e)<0? o.getEcresearchorganization() : ecresearchorganization;
|
||||||
echighereducation = o.getEchighereducation() != null ? o.getEchighereducation() : echighereducation;
|
echighereducation = o.getEchighereducation() != null && compareTrust(this, e)<0? o.getEchighereducation() : echighereducation;
|
||||||
ecinternationalorganizationeurinterests = o.getEcinternationalorganizationeurinterests() != null ? o.getEcinternationalorganizationeurinterests() : ecinternationalorganizationeurinterests;
|
ecinternationalorganizationeurinterests = o.getEcinternationalorganizationeurinterests() != null && compareTrust(this, e)<0? o.getEcinternationalorganizationeurinterests() : ecinternationalorganizationeurinterests;
|
||||||
ecinternationalorganization = o.getEcinternationalorganization() != null ? o.getEcinternationalorganization() : ecinternationalorganization;
|
ecinternationalorganization = o.getEcinternationalorganization() != null && compareTrust(this, e)<0? o.getEcinternationalorganization() : ecinternationalorganization;
|
||||||
ecenterprise = o.getEcenterprise() != null ? o.getEcenterprise() :ecenterprise;
|
ecenterprise = o.getEcenterprise() != null && compareTrust(this, e)<0? o.getEcenterprise() :ecenterprise;
|
||||||
ecsmevalidated = o.getEcsmevalidated() != null ? o.getEcsmevalidated() :ecsmevalidated;
|
ecsmevalidated = o.getEcsmevalidated() != null && compareTrust(this, e)<0? o.getEcsmevalidated() :ecsmevalidated;
|
||||||
ecnutscode = o.getEcnutscode() != null ? o.getEcnutscode() :ecnutscode;
|
ecnutscode = o.getEcnutscode() != null && compareTrust(this, e)<0? o.getEcnutscode() :ecnutscode;
|
||||||
country = o.getCountry() != null ? o.getCountry() :country;
|
country = o.getCountry() != null && compareTrust(this, e)<0 ? o.getCountry() :country;
|
||||||
|
mergeOAFDataInfo(o);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -264,4 +264,39 @@ public class Project extends OafEntity implements Serializable {
|
||||||
public void setFundedamount(Float fundedamount) {
|
public void setFundedamount(Float fundedamount) {
|
||||||
this.fundedamount = fundedamount;
|
this.fundedamount = fundedamount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void mergeFrom(OafEntity e) {
|
||||||
|
super.mergeFrom(e);
|
||||||
|
Project p = (Project)e;
|
||||||
|
|
||||||
|
websiteurl= p.getWebsiteurl()!= null && compareTrust(this,e)<0?p.getWebsiteurl():websiteurl;
|
||||||
|
code= p.getCode()!=null && compareTrust(this,e)<0?p.getCode():code;
|
||||||
|
acronym= p.getAcronym()!= null && compareTrust(this,e)<0?p.getAcronym():acronym;
|
||||||
|
title= p.getTitle()!= null && compareTrust(this,e)<0?p.getTitle():title;
|
||||||
|
startdate= p.getStartdate()!=null && compareTrust(this,e)<0?p.getStartdate():startdate;
|
||||||
|
enddate= p.getEnddate()!=null && compareTrust(this,e)<0?p.getEnddate():enddate;
|
||||||
|
callidentifier= p.getCallidentifier()!=null && compareTrust(this,e)<0?p.getCallidentifier():callidentifier;
|
||||||
|
keywords= p.getKeywords()!=null && compareTrust(this,e)<0?p.getKeywords():keywords;
|
||||||
|
duration= p.getDuration()!=null && compareTrust(this,e)<0?p.getDuration():duration;
|
||||||
|
ecsc39= p.getEcsc39()!=null && compareTrust(this,e)<0?p.getEcsc39():ecsc39;
|
||||||
|
oamandatepublications= p.getOamandatepublications()!=null && compareTrust(this,e)<0?p.getOamandatepublications():oamandatepublications;
|
||||||
|
ecarticle29_3= p.getEcarticle29_3()!=null && compareTrust(this,e)<0?p.getEcarticle29_3():ecarticle29_3;
|
||||||
|
subjects= mergeLists(subjects, p.getSubjects());
|
||||||
|
fundingtree= mergeLists(fundingtree, p.getFundingtree());
|
||||||
|
contracttype= p.getContracttype()!=null && compareTrust(this,e)<0?p.getContracttype():contracttype;
|
||||||
|
optional1= p.getOptional1()!=null && compareTrust(this,e)<0?p.getOptional1():optional1;
|
||||||
|
optional2= p.getOptional2()!=null && compareTrust(this,e)<0?p.getOptional2():optional2;
|
||||||
|
jsonextrainfo= p.getJsonextrainfo()!=null && compareTrust(this,e)<0?p.getJsonextrainfo():jsonextrainfo;
|
||||||
|
contactfullname= p.getContactfullname()!=null && compareTrust(this,e)<0?p.getContactfullname():contactfullname;
|
||||||
|
contactfax= p.getContactfax()!=null && compareTrust(this,e)<0?p.getContactfax():contactfax;
|
||||||
|
contactphone= p.getContactphone()!=null && compareTrust(this,e)<0?p.getContactphone():contactphone;
|
||||||
|
contactemail= p.getContactemail()!=null && compareTrust(this,e)<0?p.getContactemail():contactemail;
|
||||||
|
summary= p.getSummary()!=null && compareTrust(this,e)<0?p.getSummary():summary;
|
||||||
|
currency= p.getCurrency()!=null && compareTrust(this,e)<0?p.getCurrency():currency;
|
||||||
|
totalcost= p.getTotalcost()!=null && compareTrust(this,e)<0?p.getTotalcost():totalcost;
|
||||||
|
fundedamount= p.getFundedamount()!= null && compareTrust(this,e)<0?p.getFundedamount():fundedamount;
|
||||||
|
mergeOAFDataInfo(e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -21,8 +21,9 @@ public class Publication extends Result implements Serializable {
|
||||||
|
|
||||||
Publication p = (Publication) e;
|
Publication p = (Publication) e;
|
||||||
|
|
||||||
if (p.getJournal() != null)
|
if (p.getJournal() != null && compareTrust(this, e)<0)
|
||||||
journal = p.getJournal();
|
journal = p.getJournal();
|
||||||
|
mergeOAFDataInfo(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -252,10 +252,10 @@ public abstract class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
instance = mergeLists(instance, r.getInstance());
|
instance = mergeLists(instance, r.getInstance());
|
||||||
|
|
||||||
if (r.getResulttype() != null)
|
if (r.getResulttype() != null && compareTrust(this, r)<0)
|
||||||
resulttype = r.getResulttype();
|
resulttype = r.getResulttype();
|
||||||
|
|
||||||
if (r.getLanguage() != null)
|
if (r.getLanguage() != null && compareTrust(this, r)<0)
|
||||||
language = r.getLanguage();
|
language = r.getLanguage();
|
||||||
|
|
||||||
country = mergeLists(country, r.getCountry());
|
country = mergeLists(country, r.getCountry());
|
||||||
|
@ -268,10 +268,10 @@ public abstract class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
description = longestLists(description, r.getDescription());
|
description = longestLists(description, r.getDescription());
|
||||||
|
|
||||||
if (r.getPublisher() != null)
|
if (r.getPublisher() != null && compareTrust(this, r)<0)
|
||||||
publisher = r.getPublisher();
|
publisher = r.getPublisher();
|
||||||
|
|
||||||
if (r.getEmbargoenddate() != null)
|
if (r.getEmbargoenddate() != null && compareTrust(this, r)<0)
|
||||||
embargoenddate = r.getEmbargoenddate();
|
embargoenddate = r.getEmbargoenddate();
|
||||||
|
|
||||||
source = mergeLists(source, r.getSource());
|
source = mergeLists(source, r.getSource());
|
||||||
|
@ -287,21 +287,23 @@ public abstract class Result extends OafEntity implements Serializable {
|
||||||
|
|
||||||
coverage = mergeLists(coverage, r.getCoverage());
|
coverage = mergeLists(coverage, r.getCoverage());
|
||||||
|
|
||||||
if (r.getRefereed() != null)
|
if (r.getRefereed() != null && compareTrust(this, r)<0)
|
||||||
refereed = r.getRefereed();
|
refereed = r.getRefereed();
|
||||||
|
|
||||||
context = mergeLists(context, r.getContext());
|
context = mergeLists(context, r.getContext());
|
||||||
|
|
||||||
if (r.getProcessingchargeamount() != null)
|
if (r.getProcessingchargeamount() != null && compareTrust(this, r)<0)
|
||||||
processingchargeamount = r.getProcessingchargeamount();
|
processingchargeamount = r.getProcessingchargeamount();
|
||||||
|
|
||||||
if (r.getProcessingchargecurrency() != null)
|
if (r.getProcessingchargecurrency() != null && compareTrust(this, r)<0)
|
||||||
processingchargecurrency = r.getProcessingchargecurrency();
|
processingchargecurrency = r.getProcessingchargecurrency();
|
||||||
|
|
||||||
externalReference = mergeLists(externalReference, r.getExternalReference());
|
externalReference = mergeLists(externalReference, r.getExternalReference());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
private List<Field<String>> longestLists(List<Field<String>> a, List<Field<String>> b) {
|
private List<Field<String>> longestLists(List<Field<String>> a, List<Field<String>> b) {
|
||||||
if(a == null || b == null)
|
if(a == null || b == null)
|
||||||
return a==null?b:a;
|
return a==null?b:a;
|
||||||
|
|
|
@ -24,6 +24,8 @@ public class DedupRecordFactory {
|
||||||
|
|
||||||
public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) {
|
public static JavaRDD<OafEntity> createDedupRecord(final JavaSparkContext sc, final SparkSession spark, final String mergeRelsInputPath, final String entitiesInputPath, final OafEntityType entityType, final DedupConfig dedupConf) {
|
||||||
|
|
||||||
|
|
||||||
|
long ts = System.currentTimeMillis();
|
||||||
//<id, json_entity>
|
//<id, json_entity>
|
||||||
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
|
final JavaPairRDD<String, String> inputJsonEntities = sc.textFile(entitiesInputPath)
|
||||||
.mapToPair((PairFunction<String, String, String>) it ->
|
.mapToPair((PairFunction<String, String, String>) it ->
|
||||||
|
@ -43,57 +45,31 @@ public class DedupRecordFactory {
|
||||||
//<dedup_id, json_entity_merged>
|
//<dedup_id, json_entity_merged>
|
||||||
final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
final JavaPairRDD<String, String> joinResult = mergeRels.join(inputJsonEntities).mapToPair((PairFunction<Tuple2<String, Tuple2<String, String>>, String, String>) Tuple2::_2);
|
||||||
|
|
||||||
JavaPairRDD<OafKey, String> keyJson = joinResult.mapToPair((PairFunction<Tuple2<String, String>, OafKey, String>) json -> {
|
JavaPairRDD<String, Iterable<String>> sortedJoinResult = joinResult.groupByKey();
|
||||||
|
|
||||||
String idValue = json._1();
|
|
||||||
|
|
||||||
String trust = "";
|
|
||||||
try {
|
|
||||||
trust = MapDocumentUtil.getJPathString("$.dataInfo.trust", json._2());
|
|
||||||
} catch (Throwable e) {
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
//TODO remember to replace this with the actual trust retrieving
|
|
||||||
if (StringUtils.isBlank(trust)) {
|
|
||||||
Random generator = new Random();
|
|
||||||
int number = generator.nextInt(20);
|
|
||||||
double result = (number / 100.0) + 0.80;
|
|
||||||
trust = "" + result;
|
|
||||||
}
|
|
||||||
|
|
||||||
return new Tuple2<OafKey, String>(new OafKey(idValue, trust), json._2());
|
|
||||||
});
|
|
||||||
|
|
||||||
OafComparator c = new OafComparator();
|
|
||||||
//<dedup_id, mergedRecordsSortedByTrust>
|
|
||||||
JavaPairRDD<String, Iterable<String>> sortedJoinResult = keyJson.repartitionAndSortWithinPartitions(new OafPartitioner(keyJson.getNumPartitions()), c)
|
|
||||||
.mapToPair((PairFunction<Tuple2<OafKey, String>, String, String>) t -> new Tuple2<String, String>(t._1().getDedupId(), t._2()))
|
|
||||||
.groupByKey();
|
|
||||||
|
|
||||||
|
|
||||||
switch (entityType) {
|
switch (entityType) {
|
||||||
case publication:
|
case publication:
|
||||||
return sortedJoinResult.map(DedupRecordFactory::publicationMerger);
|
|
||||||
|
return sortedJoinResult.map(p->DedupRecordFactory.publicationMerger(p, ts));
|
||||||
case dataset:
|
case dataset:
|
||||||
return sortedJoinResult.map(DedupRecordFactory::datasetMerger);
|
return sortedJoinResult.map(d->DedupRecordFactory.datasetMerger(d,ts));
|
||||||
case project:
|
case project:
|
||||||
return sortedJoinResult.map(DedupRecordFactory::projectMerger);
|
return sortedJoinResult.map(p->DedupRecordFactory.projectMerger(p,ts));
|
||||||
case software:
|
case software:
|
||||||
return sortedJoinResult.map(DedupRecordFactory::softwareMerger);
|
return sortedJoinResult.map(s->DedupRecordFactory.softwareMerger(s,ts));
|
||||||
case datasource:
|
case datasource:
|
||||||
return sortedJoinResult.map(DedupRecordFactory::datasourceMerger);
|
return sortedJoinResult.map(d->DedupRecordFactory.datasourceMerger(d,ts));
|
||||||
case organization:
|
case organization:
|
||||||
return sortedJoinResult.map(DedupRecordFactory::organizationMerger);
|
return sortedJoinResult.map(o->DedupRecordFactory.organizationMerger(o,ts));
|
||||||
case otherresearchproduct:
|
case otherresearchproduct:
|
||||||
return sortedJoinResult.map(DedupRecordFactory::otherresearchproductMerger);
|
return sortedJoinResult.map(o->DedupRecordFactory.otherresearchproductMerger(o,ts));
|
||||||
default:
|
default:
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Publication publicationMerger(Tuple2<String, Iterable<String>> e) {
|
private static Publication publicationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
Publication p = new Publication(); //the result of the merge, to be returned at the end
|
Publication p = new Publication(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
@ -103,19 +79,11 @@ public class DedupRecordFactory {
|
||||||
|
|
||||||
final Collection<String> dateofacceptance = Lists.newArrayList();
|
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||||
|
|
||||||
|
|
||||||
StringBuilder trust = new StringBuilder("0.0");
|
|
||||||
|
|
||||||
if (e._2() != null)
|
if (e._2() != null)
|
||||||
e._2().forEach(pub -> {
|
e._2().forEach(pub -> {
|
||||||
try {
|
try {
|
||||||
Publication publication = mapper.readValue(pub, Publication.class);
|
Publication publication = mapper.readValue(pub, Publication.class);
|
||||||
|
|
||||||
final String currentTrust = publication.getDataInfo().getTrust();
|
|
||||||
if (!"1.0".equals(currentTrust)) {
|
|
||||||
trust.setLength(0);
|
|
||||||
trust.append(currentTrust);
|
|
||||||
}
|
|
||||||
p.mergeFrom(publication);
|
p.mergeFrom(publication);
|
||||||
p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
|
p.setAuthor(DedupUtility.mergeAuthor(p.getAuthor(), publication.getAuthor()));
|
||||||
//add to the list if they are not null
|
//add to the list if they are not null
|
||||||
|
@ -126,30 +94,74 @@ public class DedupRecordFactory {
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
p.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
p.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||||
|
p.getDataInfo().setTrust("0.9");
|
||||||
|
p.setLastupdatetimestamp(ts);
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e) {
|
private static Dataset datasetMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
|
Dataset d = new Dataset(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
d.setId(e._1());
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
final Collection<String> dateofacceptance = Lists.newArrayList();
|
||||||
|
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(dat -> {
|
||||||
|
try {
|
||||||
|
Dataset dataset = mapper.readValue(dat, Dataset.class);
|
||||||
|
|
||||||
|
d.mergeFrom(dataset);
|
||||||
|
d.setAuthor(DedupUtility.mergeAuthor(d.getAuthor(), dataset.getAuthor()));
|
||||||
|
//add to the list if they are not null
|
||||||
|
if (dataset.getDateofacceptance() != null)
|
||||||
|
dateofacceptance.add(dataset.getDateofacceptance().getValue());
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
d.setDateofacceptance(DatePicker.pick(dateofacceptance));
|
||||||
|
d.getDataInfo().setTrust("0.9");
|
||||||
|
d.setLastupdatetimestamp(ts);
|
||||||
|
return d;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Project projectMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
|
Project p = new Project(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
p.setId(e._1());
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
if (e._2() != null)
|
||||||
|
e._2().forEach(proj -> {
|
||||||
|
try {
|
||||||
|
Project project = mapper.readValue(proj, Project.class);
|
||||||
|
|
||||||
|
p.mergeFrom(project);
|
||||||
|
} catch (Exception exc) {
|
||||||
|
throw new RuntimeException(exc);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
p.getDataInfo().setTrust("0.9");
|
||||||
|
p.setLastupdatetimestamp(ts);
|
||||||
|
return p;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Software softwareMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
throw new NotImplementedException();
|
throw new NotImplementedException();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Project projectMerger(Tuple2<String, Iterable<String>> e) {
|
private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
throw new NotImplementedException();
|
throw new NotImplementedException();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Software softwareMerger(Tuple2<String, Iterable<String>> e) {
|
private static Organization organizationMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
throw new NotImplementedException();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Datasource datasourceMerger(Tuple2<String, Iterable<String>> e) {
|
|
||||||
|
|
||||||
throw new NotImplementedException();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Organization organizationMerger(Tuple2<String, Iterable<String>> e) {
|
|
||||||
|
|
||||||
Organization o = new Organization(); //the result of the merge, to be returned at the end
|
Organization o = new Organization(); //the result of the merge, to be returned at the end
|
||||||
|
|
||||||
|
@ -180,7 +192,7 @@ public class DedupRecordFactory {
|
||||||
return o;
|
return o;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e) {
|
private static OtherResearchProduct otherresearchproductMerger(Tuple2<String, Iterable<String>> e, final long ts) {
|
||||||
|
|
||||||
throw new NotImplementedException();
|
throw new NotImplementedException();
|
||||||
}
|
}
|
||||||
|
|
|
@ -109,58 +109,26 @@ public class DedupUtility {
|
||||||
int sa = authorsSize(a);
|
int sa = authorsSize(a);
|
||||||
int sb = authorsSize(b);
|
int sb = authorsSize(b);
|
||||||
|
|
||||||
if(pa == pb){
|
if (pa == pb) {
|
||||||
base = sa>sb?a:b;
|
base = sa > sb ? a : b;
|
||||||
enrich = sa>sb?b:a;
|
enrich = sa > sb ? b : a;
|
||||||
} else {
|
} else {
|
||||||
base = pa>pb?a:b;
|
base = pa > pb ? a : b;
|
||||||
enrich = pa>pb?b:a;
|
enrich = pa > pb ? b : a;
|
||||||
}
|
}
|
||||||
enrichPidFromList(base, enrich);
|
enrichPidFromList(base, enrich);
|
||||||
return base;
|
return base;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// //if both have no authors with pids
|
|
||||||
// if (pa < 1 && pb < 1) {
|
|
||||||
// //B is bigger than A
|
|
||||||
// if (sa < sb)
|
|
||||||
// return b;
|
|
||||||
// //A is bigger than B
|
|
||||||
// else
|
|
||||||
// return a;
|
|
||||||
// }
|
|
||||||
// //If A has author with pids
|
|
||||||
// if (pa > 0) {
|
|
||||||
// //B has no author with pid
|
|
||||||
// if (pb < 1)
|
|
||||||
// return a;
|
|
||||||
// //B has author with pid
|
|
||||||
// else {
|
|
||||||
// enrichPidFromList(a, b);
|
|
||||||
// return a;
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// //If B has author with pids
|
|
||||||
// //A has no author with pid
|
|
||||||
// if (pa < 1)
|
|
||||||
// return b;
|
|
||||||
// //A has author with pid
|
|
||||||
// else {
|
|
||||||
// enrichPidFromList(b, a);
|
|
||||||
// return b;
|
|
||||||
// }
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
private static void enrichPidFromList(List<Author> base, List<Author> enrich) {
|
||||||
if(base==null || enrich == null)
|
if (base == null || enrich == null)
|
||||||
return;
|
return;
|
||||||
final Map<String, Author> basePidAuthorMap = base.stream()
|
final Map<String, Author> basePidAuthorMap = base.stream()
|
||||||
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
.filter(a -> a.getPid() != null && a.getPid().size() > 0)
|
||||||
.flatMap(a -> a.getPid()
|
.flatMap(a -> a.getPid()
|
||||||
.stream()
|
.stream()
|
||||||
.map(p -> new Tuple2<>(p.toComparableString(), a))
|
.map(p -> new Tuple2<>(p.toComparableString(), a))
|
||||||
).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2));
|
).collect(Collectors.toMap(Tuple2::_1, Tuple2::_2, (x1, x2) -> x1));
|
||||||
|
|
||||||
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
final List<Tuple2<StructuredProperty, Author>> pidToEnrich = enrich
|
||||||
.stream()
|
.stream()
|
||||||
|
@ -171,7 +139,7 @@ public class DedupUtility {
|
||||||
|
|
||||||
pidToEnrich.forEach(a -> {
|
pidToEnrich.forEach(a -> {
|
||||||
Optional<Tuple2<Double, Author>> simAuhtor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1));
|
Optional<Tuple2<Double, Author>> simAuhtor = base.stream().map(ba -> new Tuple2<>(sim(ba, a._2()), ba)).max(Comparator.comparing(Tuple2::_1));
|
||||||
if (simAuhtor.isPresent() && simAuhtor.get()._1()> THRESHOLD) {
|
if (simAuhtor.isPresent() && simAuhtor.get()._1() > THRESHOLD) {
|
||||||
Author r = simAuhtor.get()._2();
|
Author r = simAuhtor.get()._2();
|
||||||
r.getPid().add(a._1());
|
r.getPid().add(a._1());
|
||||||
}
|
}
|
||||||
|
@ -179,15 +147,15 @@ public class DedupUtility {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String createEntityPath(final String basePath, final String entityType) {
|
public static String createEntityPath(final String basePath, final String entityType) {
|
||||||
return String.format("%s/%s", basePath,entityType);
|
return String.format("%s/%s", basePath, entityType);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String createSimRelPath(final String basePath, final String entityType) {
|
public static String createSimRelPath(final String basePath, final String entityType) {
|
||||||
return String.format("%s/%s_simRel", basePath,entityType);
|
return String.format("%s/%s_simRel", basePath, entityType);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static String createMergeRelPath(final String basePath, final String entityType) {
|
public static String createMergeRelPath(final String basePath, final String entityType) {
|
||||||
return String.format("%s/%s_mergeRel", basePath,entityType);
|
return String.format("%s/%s_mergeRel", basePath, entityType);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Double sim(Author a, Author b) {
|
private static Double sim(Author a, Author b) {
|
||||||
|
@ -220,6 +188,7 @@ public class DedupUtility {
|
||||||
private static String nfd(final String s) {
|
private static String nfd(final String s) {
|
||||||
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
return Normalizer.normalize(s, Normalizer.Form.NFD);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Person parse(Author author) {
|
private static Person parse(Author author) {
|
||||||
if (StringUtils.isNotBlank(author.getSurname())) {
|
if (StringUtils.isNotBlank(author.getSurname())) {
|
||||||
return new Person(author.getSurname() + ", " + author.getName(), false);
|
return new Person(author.getSurname() + ", " + author.getName(), false);
|
||||||
|
@ -233,7 +202,7 @@ public class DedupUtility {
|
||||||
if (authors == null)
|
if (authors == null)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
return (int) authors.stream().map(DedupUtility::extractAuthorPid).filter(Objects::nonNull).filter(StringUtils::isNotBlank).count();
|
return (int) authors.stream().filter(DedupUtility::hasPid).count();
|
||||||
}
|
}
|
||||||
|
|
||||||
private static int authorsSize(List<Author> authors) {
|
private static int authorsSize(List<Author> authors) {
|
||||||
|
@ -242,29 +211,9 @@ public class DedupUtility {
|
||||||
return authors.size();
|
return authors.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static boolean hasPid(Author a) {
|
||||||
private static boolean isAccurate(final Author a) {
|
|
||||||
return StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname());
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String extractAuthorPid(Author a) {
|
|
||||||
|
|
||||||
if (a == null || a.getPid() == null || a.getPid().size() == 0)
|
if (a == null || a.getPid() == null || a.getPid().size() == 0)
|
||||||
return null;
|
return false;
|
||||||
|
return a.getPid().stream().anyMatch(p -> p != null && StringUtils.isNotBlank(p.getValue()));
|
||||||
StringBuilder mainPid = new StringBuilder();
|
|
||||||
|
|
||||||
a.getPid().forEach(pid -> {
|
|
||||||
if (pid.getQualifier().getClassid().equalsIgnoreCase("orcid")) {
|
|
||||||
mainPid.setLength(0);
|
|
||||||
mainPid.append(pid.getValue());
|
|
||||||
} else {
|
|
||||||
if (mainPid.length() == 0)
|
|
||||||
mainPid.append(pid.getValue());
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return mainPid.toString();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,15 +0,0 @@
|
||||||
package eu.dnetlib.dedup;
|
|
||||||
import com.google.common.collect.ComparisonChain;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.Comparator;
|
|
||||||
|
|
||||||
public class OafComparator implements Comparator<OafKey>, Serializable {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(OafKey a, OafKey b) {
|
|
||||||
return ComparisonChain.start()
|
|
||||||
.compare(a.getDedupId(), b.getDedupId())
|
|
||||||
.compare(a.getTrust(), b.getTrust())
|
|
||||||
.result();
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,31 +0,0 @@
|
||||||
package eu.dnetlib.dedup;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
public class OafKey implements Serializable {
|
|
||||||
|
|
||||||
private String dedupId;
|
|
||||||
private String trust;
|
|
||||||
|
|
||||||
public OafKey(String dedupId, String trust) {
|
|
||||||
this.dedupId = dedupId;
|
|
||||||
this.trust = trust;
|
|
||||||
}
|
|
||||||
public OafKey() {
|
|
||||||
}
|
|
||||||
public String getDedupId() {
|
|
||||||
return dedupId;
|
|
||||||
}
|
|
||||||
public void setDedupId(String dedupId) {
|
|
||||||
this.dedupId = dedupId;
|
|
||||||
}
|
|
||||||
public String getTrust() {
|
|
||||||
return trust;
|
|
||||||
}
|
|
||||||
public void setTrust(String trust) {
|
|
||||||
this.trust = trust;
|
|
||||||
}
|
|
||||||
@Override
|
|
||||||
public String toString(){
|
|
||||||
return String.format("%s->%d", dedupId,trust);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,59 +0,0 @@
|
||||||
package eu.dnetlib.dedup;
|
|
||||||
|
|
||||||
import org.apache.spark.Partitioner;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
|
|
||||||
public class OafPartitioner extends Partitioner implements Serializable {
|
|
||||||
|
|
||||||
private final int numPartitions;
|
|
||||||
|
|
||||||
public OafPartitioner(int partitions) {
|
|
||||||
assert (partitions > 0);
|
|
||||||
this.numPartitions = partitions;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int numPartitions() {
|
|
||||||
return numPartitions;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int getPartition(Object key) {
|
|
||||||
if (key instanceof OafKey) {
|
|
||||||
@SuppressWarnings("unchecked")
|
|
||||||
OafKey item = (OafKey) key;
|
|
||||||
return Math.abs(item.getDedupId().hashCode() % numPartitions);
|
|
||||||
} else {
|
|
||||||
throw new IllegalArgumentException("Unexpected Key");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int hashCode() {
|
|
||||||
final int prime = 31;
|
|
||||||
int result = 1;
|
|
||||||
result = prime * result + numPartitions;
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public boolean equals(Object obj) {
|
|
||||||
if (this == obj) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
if (obj == null) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (!(obj instanceof OafPartitioner)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
//
|
|
||||||
OafPartitioner other = (OafPartitioner) obj;
|
|
||||||
if (numPartitions != other.numPartitions) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
//
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -30,7 +30,7 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="CreateSimRels"/>
|
<start to="CreateDedupRecord"/>
|
||||||
|
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
|
|
Loading…
Reference in New Issue