Minor changes in dedup tests, bug fix in the IdGenerator, and pace-core version update

This commit is contained in:
miconis 2020-09-29 15:31:46 +02:00
parent 4cf79f32eb
commit e3f7798d1b
16 changed files with 751 additions and 669 deletions

View File

@ -90,6 +90,10 @@
<groupId>com.fasterxml.jackson.core</groupId> <groupId>com.fasterxml.jackson.core</groupId>
<artifactId>jackson-core</artifactId> <artifactId>jackson-core</artifactId>
</dependency> </dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
</dependency>
</dependencies> </dependencies>

View File

@ -29,6 +29,7 @@ import eu.dnetlib.pace.config.DedupConfig;
abstract class AbstractSparkAction implements Serializable { abstract class AbstractSparkAction implements Serializable {
protected static final int NUM_PARTITIONS = 1000; protected static final int NUM_PARTITIONS = 1000;
protected static final int NUM_CONNECTIONS = 20;
protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);

View File

@ -1,12 +1,10 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import com.fasterxml.jackson.databind.DeserializationFeature; import java.text.ParseException;
import com.fasterxml.jackson.databind.ObjectMapper; import java.text.SimpleDateFormat;
import com.google.common.collect.Lists; import java.util.*;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang.StringUtils; import org.apache.commons.lang.StringUtils;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.api.java.function.MapGroupsFunction;
@ -15,11 +13,15 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.text.ParseException; import com.fasterxml.jackson.databind.DeserializationFeature;
import java.text.SimpleDateFormat; import com.fasterxml.jackson.databind.ObjectMapper;
import java.util.*; import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import scala.Tuple2;
public class DedupRecordFactory { public class DedupRecordFactory {
@ -80,14 +82,14 @@ public class DedupRecordFactory {
final Collection<String> dates = Lists.newArrayList(); final Collection<String> dates = Lists.newArrayList();
final List<List<Author>> authors = Lists.newArrayList(); final List<List<Author>> authors = Lists.newArrayList();
final List<Identifier> bestPids = Lists.newArrayList(); //best pids list final List<Identifier> bestPids = Lists.newArrayList(); // best pids list
entities entities
.forEachRemaining( .forEachRemaining(
t -> { t -> {
T duplicate = t._2(); T duplicate = t._2();
//prepare the list of pids to use for the id generation // prepare the list of pids to use for the id generation
bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate)); bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate));
entity.mergeFrom(duplicate); entity.mergeFrom(duplicate);
@ -115,5 +117,4 @@ public class DedupRecordFactory {
return entity; return entity;
} }
} }

View File

@ -1,90 +1,112 @@
package eu.dnetlib.dhp.oa.dedup;
import com.google.common.collect.Lists; package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.lang.NullArgumentException;
import org.apache.commons.lang.StringUtils;
import java.io.Serializable; import java.io.Serializable;
import java.text.ParseException; import java.text.ParseException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.*; import java.util.*;
import org.apache.commons.lang.NullArgumentException;
import org.apache.commons.lang.StringUtils;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class IdGenerator implements Serializable { public class IdGenerator implements Serializable {
private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd"); public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2"; public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
//pick the best pid from the list (consider date and pidtype) // pick the best pid from the list (consider date and pidtype)
public static String generate(List<Identifier> pids, String defaultID) { public static String generate(List<Identifier> pids, String defaultID) {
if (pids == null || pids.size() == 0) if (pids == null || pids.size() == 0)
return defaultID; return defaultID;
Optional<Identifier> bp = pids.stream() Optional<Identifier> bp = pids
.max(Identifier::compareTo); .stream()
.max(Identifier::compareTo);
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) { if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::" + DedupUtility.md5(bp.get().getOriginalID()); return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
} else { + DedupUtility.md5(bp.get().getOriginalID());
return bp.get().getOriginalID().split("\\|")[0] + "|" + createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::" + DedupUtility.md5(bp.get().getPid().getValue()); } else {
} return bp.get().getOriginalID().split("\\|")[0] + "|"
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
+ DedupUtility.md5(bp.get().getPid().getValue());
}
} }
//pick the best pid from the entity. Returns a list (length 1) to save time in the call // pick the best pid from the entity. Returns a list (length 1) to save time in the call
public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) { public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) {
if (entity.getPid() == null || entity.getPid().size() == 0) if (entity.getPid() == null || entity.getPid().size() == 0)
return Lists.newArrayList(new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())); return Lists
.newArrayList(
new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(),
EntityType.fromClass(entity.getClass()), entity.getId()));
Optional<StructuredProperty> bp = entity.getPid().stream() Optional<StructuredProperty> bp = entity
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined) .getPid()
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()))); .stream()
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
return bp.map(structuredProperty -> return bp
Lists.newArrayList(new Identifier(structuredProperty, extractDate(entity, sdf), PidType.classidValueOf(structuredProperty.getQualifier().getClassid()), entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())) .map(
).orElseGet(() -> Lists.newArrayList(new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId()))); structuredProperty -> Lists
.newArrayList(
new Identifier(structuredProperty, extractDate(entity, new SimpleDateFormat("yyyy-MM-dd")),
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
.orElseGet(
() -> Lists
.newArrayList(
new Identifier(new StructuredProperty(), new Date(), PidType.original,
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())));
} }
//create the prefix (length = 12): dedup_+ pidType // create the prefix (length = 12): dedup_+ pidType
public static String createPrefix(String pidType) { public static String createPrefix(String pidType) {
StringBuilder prefix = new StringBuilder("dedup_" + pidType); StringBuilder prefix = new StringBuilder("dedup_" + pidType);
while (prefix.length() < 12) { while (prefix.length() < 12) {
prefix.append("_"); prefix.append("_");
} }
return prefix.toString().substring(0, 12); return prefix.toString().substring(0, 12);
} }
//extracts the date from the record. If the date is not available or is not wellformed, it returns a base date: 00-01-01 // extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf){ // 00-01-01
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
String date = "2000-01-01"; String date = "2000-01-01";
if (ModelSupport.isSubClass(duplicate, Result.class)) { if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result result = (Result) duplicate; Result result = (Result) duplicate;
if (isWellformed(result.getDateofacceptance())){ if (isWellformed(result.getDateofacceptance())) {
date = result.getDateofacceptance().getValue(); date = result.getDateofacceptance().getValue();
} }
} }
try { try {
return sdf.parse(date); return sdf.parse(date);
} catch (ParseException e) { } catch (ParseException e) {
return new Date(); return new Date();
} }
} }
public static boolean isWellformed(Field<String> date) { public static boolean isWellformed(Field<String> date) {
return date != null && StringUtils.isNotBlank(date.getValue()) && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue()); return date != null && StringUtils.isNotBlank(date.getValue())
} && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
}
} }

View File

@ -1,132 +1,138 @@
package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.schema.common.EntityType; package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.io.Serializable; import java.io.Serializable;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
public class Identifier implements Serializable, Comparable<Identifier>{ import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
StructuredProperty pid; public class Identifier implements Serializable, Comparable<Identifier> {
Date date;
PidType type;
List<KeyValue> collectedFrom;
EntityType entityType;
String originalID;
boolean useOriginal = false; //to know if the top identifier won because of the alphabetical order of the original ID StructuredProperty pid;
Date date;
PidType type;
List<KeyValue> collectedFrom;
EntityType entityType;
String originalID;
public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom, EntityType entityType, String originalID) { boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original
this.pid = pid; // ID
this.date = date;
this.type = type;
this.collectedFrom = collectedFrom;
this.entityType = entityType;
this.originalID = originalID;
}
public StructuredProperty getPid() { public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom,
return pid; EntityType entityType, String originalID) {
} this.pid = pid;
this.date = date;
this.type = type;
this.collectedFrom = collectedFrom;
this.entityType = entityType;
this.originalID = originalID;
}
public void setPid(StructuredProperty pidValue) { public StructuredProperty getPid() {
this.pid = pid; return pid;
} }
public Date getDate() { public void setPid(StructuredProperty pidValue) {
return date; this.pid = pid;
} }
public void setDate(Date date) { public Date getDate() {
this.date = date; return date;
} }
public PidType getType() { public void setDate(Date date) {
return type; this.date = date;
} }
public void setType(PidType type) { public PidType getType() {
this.type = type; return type;
} }
public List<KeyValue> getCollectedFrom() { public void setType(PidType type) {
return collectedFrom; this.type = type;
} }
public void setCollectedFrom(List<KeyValue> collectedFrom) { public List<KeyValue> getCollectedFrom() {
this.collectedFrom = collectedFrom; return collectedFrom;
} }
public EntityType getEntityType() { public void setCollectedFrom(List<KeyValue> collectedFrom) {
return entityType; this.collectedFrom = collectedFrom;
} }
public void setEntityType(EntityType entityType) { public EntityType getEntityType() {
this.entityType = entityType; return entityType;
} }
public String getOriginalID() { public void setEntityType(EntityType entityType) {
return originalID; this.entityType = entityType;
} }
public void setOriginalID(String originalID) { public String getOriginalID() {
this.originalID = originalID; return originalID;
} }
public boolean isUseOriginal() { public void setOriginalID(String originalID) {
return useOriginal; this.originalID = originalID;
} }
public void setUseOriginal(boolean useOriginal) { public boolean isUseOriginal() {
this.useOriginal = useOriginal; return useOriginal;
} }
@Override public void setUseOriginal(boolean useOriginal) {
public int compareTo(Identifier i) { this.useOriginal = useOriginal;
//priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4) alphabetical order of the originalID }
if (this.getType().compareTo(i.getType()) == 0){ //same type
if (entityType == EntityType.publication) {
if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID) && !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
return 1;
if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID) && !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
return -1;
}
if (entityType == EntityType.dataset) {
if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID) && !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
return 1;
if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID) && !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
return -1;
}
if (this.getDate().compareTo(date) == 0) {//same date @Override
public int compareTo(Identifier i) {
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
// alphabetical order of the originalID
if (this.getType().compareTo(i.getType()) == 0) { // same type
if (entityType == EntityType.publication) {
if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
return 1;
if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
return -1;
}
if (entityType == EntityType.dataset) {
if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
return 1;
if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
return -1;
}
if (this.originalID.compareTo(i.originalID) > 0) if (this.getDate().compareTo(date) == 0) {// same date
this.useOriginal = true;
else
i.setUseOriginal(true);
//the minus because we need to take the alphabetically lower id if (this.originalID.compareTo(i.originalID) > 0)
return -this.originalID.compareTo(i.originalID); this.useOriginal = true;
} else
else i.setUseOriginal(true);
//the minus is because we need to take the elder date
return -this.getDate().compareTo(date);
}
else {
return this.getType().compareTo(i.getType());
}
} // the minus because we need to take the alphabetically lower id
return -this.originalID.compareTo(i.originalID);
} else
// the minus is because we need to take the elder date
return -this.getDate().compareTo(date);
} else {
return this.getType().compareTo(i.getType());
}
public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId){ }
for(KeyValue cf: collectedFrom) { public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId) {
if(cf.getKey().equals(dsId))
return true; for (KeyValue cf : collectedFrom) {
} if (cf.getKey().equals(dsId))
return false; return true;
} }
return false;
}
} }

View File

@ -1,83 +1,98 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import java.io.Serializable; import java.io.Serializable;
/**
 * Serializable bean with seven string properties describing one organization similarity row.
 * All properties are optional; the no-arg constructor leaves them null.
 */
public class OrgSimRel implements Serializable {

    String local_id;
    String oa_original_id;
    String oa_name;
    String oa_acronym;
    String oa_country;
    String oa_url;
    String oa_collectedfrom;

    /** Creates an empty bean; populate it through the setters. */
    public OrgSimRel() {
    }

    /** Creates a fully populated bean; every argument is stored as given. */
    public OrgSimRel(String local_id, String oa_original_id, String oa_name, String oa_acronym, String oa_country,
        String oa_url, String oa_collectedfrom) {
        this.local_id = local_id;
        this.oa_original_id = oa_original_id;
        this.oa_name = oa_name;
        this.oa_acronym = oa_acronym;
        this.oa_country = oa_country;
        this.oa_url = oa_url;
        this.oa_collectedfrom = oa_collectedfrom;
    }

    // ---- accessors ----

    public String getLocal_id() {
        return this.local_id;
    }

    public void setLocal_id(String local_id) {
        this.local_id = local_id;
    }

    public String getOa_original_id() {
        return this.oa_original_id;
    }

    public void setOa_original_id(String oa_original_id) {
        this.oa_original_id = oa_original_id;
    }

    public String getOa_name() {
        return this.oa_name;
    }

    public void setOa_name(String oa_name) {
        this.oa_name = oa_name;
    }

    public String getOa_acronym() {
        return this.oa_acronym;
    }

    public void setOa_acronym(String oa_acronym) {
        this.oa_acronym = oa_acronym;
    }

    public String getOa_country() {
        return this.oa_country;
    }

    public void setOa_country(String oa_country) {
        this.oa_country = oa_country;
    }

    public String getOa_url() {
        return this.oa_url;
    }

    public void setOa_url(String oa_url) {
        this.oa_url = oa_url;
    }

    public String getOa_collectedfrom() {
        return this.oa_collectedfrom;
    }

    public void setOa_collectedfrom(String oa_collectedfrom) {
        this.oa_collectedfrom = oa_collectedfrom;
    }

    /** Renders all properties as {@code OrgSimRel{name='value', ...}}; null values print as "null". */
    @Override
    public String toString() {
        final StringBuilder text = new StringBuilder("OrgSimRel{");
        text.append("local_id='").append(local_id).append('\'');
        text.append(", oa_original_id='").append(oa_original_id).append('\'');
        text.append(", oa_name='").append(oa_name).append('\'');
        text.append(", oa_acronym='").append(oa_acronym).append('\'');
        text.append(", oa_country='").append(oa_country).append('\'');
        text.append(", oa_url='").append(oa_url).append('\'');
        text.append(", oa_collectedfrom='").append(oa_collectedfrom).append('\'');
        text.append('}');
        return text.toString();
    }
}

View File

@ -1,25 +1,17 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
/**
 * Known persistent identifier types. The declaration order is meaningful: constants are listed
 * from the least to the most important one.
 */
public enum PidType {

    // from the less to the more important
    undefined, original, orcid, ror, grid, pdb, arXiv, pmid, doi;

    /**
     * Lenient counterpart of {@link #valueOf(String)}.
     *
     * @param s the classid to resolve; matched exactly (case-sensitive) against the constant names
     * @return the matching constant, or {@link #undefined} when {@code s} is null or unknown
     */
    public static PidType classidValueOf(String s) {
        for (PidType candidate : values()) {
            if (candidate.name().equals(s)) {
                return candidate;
            }
        }
        return undefined;
    }

}

View File

@ -1,21 +1,5 @@
package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.*;
import org.dom4j.DocumentException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
@ -24,153 +8,177 @@ import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.StreamSupport; import java.util.stream.StreamSupport;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.*;
import org.dom4j.DocumentException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import scala.Tuple2;
public class SparkCollectSimRels extends AbstractSparkAction { public class SparkCollectSimRels extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkCollectSimRels.class); private static final Logger log = LoggerFactory.getLogger(SparkCollectSimRels.class);
Dataset<Row> simGroupsDS; Dataset<Row> simGroupsDS;
Dataset<Row> groupsDS; Dataset<Row> groupsDS;
public SparkCollectSimRels(ArgumentApplicationParser parser, SparkSession spark, Dataset<Row> simGroupsDS, Dataset<Row> groupsDS) { public SparkCollectSimRels(ArgumentApplicationParser parser, SparkSession spark, Dataset<Row> simGroupsDS,
super(parser, spark); Dataset<Row> groupsDS) {
this.simGroupsDS = simGroupsDS; super(parser, spark);
this.groupsDS = groupsDS; this.simGroupsDS = simGroupsDS;
} this.groupsDS = groupsDS;
}
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString( .toString(
SparkBlockStats.class SparkBlockStats.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json"))); "/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json")));
parser.parseArgument(args); parser.parseArgument(args);
SparkConf conf = new SparkConf(); SparkConf conf = new SparkConf();
final String dbUrl = parser.get("postgresUrl"); final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser"); final String dbUser = parser.get("postgresUser");
final String dbPassword = parser.get("postgresPassword"); final String dbPassword = parser.get("postgresPassword");
SparkSession spark = getSparkSession(conf); SparkSession spark = getSparkSession(conf);
DataFrameReader readOptions = spark.read() DataFrameReader readOptions = spark
.format("jdbc") .read()
.option("url", dbUrl) .format("jdbc")
.option("user", dbUser) .option("url", dbUrl)
.option("password", dbPassword); .option("user", dbUser)
.option("password", dbPassword);
new SparkCollectSimRels( new SparkCollectSimRels(
parser, parser,
spark, spark,
readOptions.option("dbtable", "similarity_groups").load(), readOptions.option("dbtable", "similarity_groups").load(),
readOptions.option("dbtable", "groups").load() readOptions.option("dbtable", "groups").load())
).run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
} }
@Override @Override
void run(ISLookUpService isLookUpService) throws DocumentException, ISLookUpException, IOException { void run(ISLookUpService isLookUpService) throws DocumentException, ISLookUpException, IOException {
// read oozie parameters // read oozie parameters
final String isLookUpUrl = parser.get("isLookUpUrl"); final String isLookUpUrl = parser.get("isLookUpUrl");
final String actionSetId = parser.get("actionSetId"); final String actionSetId = parser.get("actionSetId");
final String workingPath = parser.get("workingPath"); final String workingPath = parser.get("workingPath");
final int numPartitions = Optional final int numPartitions = Optional
.ofNullable(parser.get("numPartitions")) .ofNullable(parser.get("numPartitions"))
.map(Integer::valueOf) .map(Integer::valueOf)
.orElse(NUM_PARTITIONS); .orElse(NUM_PARTITIONS);
final String dbUrl = parser.get("postgresUrl"); final String dbUrl = parser.get("postgresUrl");
final String dbUser = parser.get("postgresUser"); final String dbUser = parser.get("postgresUser");
log.info("numPartitions: '{}'", numPartitions); log.info("numPartitions: '{}'", numPartitions);
log.info("isLookUpUrl: '{}'", isLookUpUrl); log.info("isLookUpUrl: '{}'", isLookUpUrl);
log.info("actionSetId: '{}'", actionSetId); log.info("actionSetId: '{}'", actionSetId);
log.info("workingPath: '{}'", workingPath); log.info("workingPath: '{}'", workingPath);
log.info("postgresUser: {}", dbUser); log.info("postgresUser: {}", dbUser);
log.info("postgresUrl: {}", dbUrl); log.info("postgresUrl: {}", dbUrl);
log.info("postgresPassword: xxx"); log.info("postgresPassword: xxx");
JavaPairRDD<String, List<String>> similarityGroup = JavaPairRDD<String, List<String>> similarityGroup = simGroupsDS
simGroupsDS .toJavaRDD()
.toJavaRDD() .mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)))
.mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1))) .groupByKey()
.groupByKey() .mapToPair(
.mapToPair(i -> new Tuple2<>(i._1(), StreamSupport.stream(i._2().spliterator(), false) i -> new Tuple2<>(i._1(), StreamSupport
.collect(Collectors.toList()))); .stream(i._2().spliterator(), false)
.collect(Collectors.toList())));
JavaPairRDD<String, String> groupIds = JavaPairRDD<String, String> groupIds = groupsDS
groupsDS .toJavaRDD()
.toJavaRDD() .mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)));
.mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)));
JavaRDD<Tuple2<Tuple2<String, String>, List<String>>> groups = similarityGroup JavaRDD<Tuple2<Tuple2<String, String>, List<String>>> groups = similarityGroup
.leftOuterJoin(groupIds) .leftOuterJoin(groupIds)
.filter(g -> g._2()._2().isPresent()) .filter(g -> g._2()._2().isPresent())
.map(g -> new Tuple2<>(new Tuple2<>(g._1(), g._2()._2().get()), g._2()._1())); .map(g -> new Tuple2<>(new Tuple2<>(g._1(), g._2()._2().get()), g._2()._1()));
JavaRDD<Relation> relations = groups.flatMap(g -> { JavaRDD<Relation> relations = groups.flatMap(g -> {
String firstId = g._2().get(0); String firstId = g._2().get(0);
List<Relation> rels = new ArrayList<>(); List<Relation> rels = new ArrayList<>();
for (String id : g._2()) { for (String id : g._2()) {
if (!firstId.equals(id)) if (!firstId.equals(id))
rels.add(createSimRel(firstId, id, g._1()._2())); rels.add(createSimRel(firstId, id, g._1()._2()));
} }
return rels.iterator(); return rels.iterator();
}); });
Dataset<Relation> resultRelations = spark.createDataset( Dataset<Relation> resultRelations = spark
relations.filter(r -> r.getRelType().equals("resultResult")).rdd(), .createDataset(
Encoders.bean(Relation.class) relations.filter(r -> r.getRelType().equals("resultResult")).rdd(),
).repartition(numPartitions); Encoders.bean(Relation.class))
.repartition(numPartitions);
Dataset<Relation> organizationRelations = spark.createDataset( Dataset<Relation> organizationRelations = spark
relations.filter(r -> r.getRelType().equals("organizationOrganization")).rdd(), .createDataset(
Encoders.bean(Relation.class) relations.filter(r -> r.getRelType().equals("organizationOrganization")).rdd(),
).repartition(numPartitions); Encoders.bean(Relation.class))
.repartition(numPartitions);
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) { for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
switch(dedupConf.getWf().getSubEntityValue()){ switch (dedupConf.getWf().getSubEntityValue()) {
case "organization": case "organization":
savePostgresRelation(organizationRelations, workingPath, actionSetId, "organization"); savePostgresRelation(organizationRelations, workingPath, actionSetId, "organization");
break; break;
default: default:
savePostgresRelation(resultRelations, workingPath, actionSetId, dedupConf.getWf().getSubEntityValue()); savePostgresRelation(
break; resultRelations, workingPath, actionSetId, dedupConf.getWf().getSubEntityValue());
} break;
} }
}
} }
private Relation createSimRel(String source, String target, String entity) { private Relation createSimRel(String source, String target, String entity) {
final Relation r = new Relation(); final Relation r = new Relation();
r.setSubRelType("dedupSimilarity"); r.setSubRelType("dedupSimilarity");
r.setRelClass("isSimilarTo"); r.setRelClass("isSimilarTo");
r.setDataInfo(new DataInfo()); r.setDataInfo(new DataInfo());
switch (entity) { switch (entity) {
case "result": case "result":
r.setSource("50|" + source); r.setSource("50|" + source);
r.setTarget("50|" + target); r.setTarget("50|" + target);
r.setRelType("resultResult"); r.setRelType("resultResult");
break; break;
case "organization": case "organization":
r.setSource("20|" + source); r.setSource("20|" + source);
r.setTarget("20|" + target); r.setTarget("20|" + target);
r.setRelType("organizationOrganization"); r.setRelType("organizationOrganization");
break; break;
default: default:
throw new IllegalArgumentException("unmanaged entity type: " + entity); throw new IllegalArgumentException("unmanaged entity type: " + entity);
} }
return r; return r;
} }
private void savePostgresRelation(Dataset<Relation> newRelations, String workingPath, String actionSetId, String entityType) { private void savePostgresRelation(Dataset<Relation> newRelations, String workingPath, String actionSetId,
newRelations String entityType) {
.write() newRelations
.mode(SaveMode.Append) .write()
.parquet(DedupUtility.createSimRelPath(workingPath, actionSetId, entityType)); .mode(SaveMode.Append)
} .parquet(DedupUtility.createSimRelPath(workingPath, actionSetId, entityType));
}
} }

View File

@ -104,13 +104,13 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s)) .map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s)); .mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
final RDD<Edge<String>> edgeRdd = spark final RDD<Edge<String>> edgeRdd = spark
.read() .read()
.load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity)) .load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
.as(Encoders.bean(Relation.class)) .as(Encoders.bean(Relation.class))
.javaRDD() .javaRDD()
.map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass())) .map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass()))
.rdd(); .rdd();
final Dataset<Relation> mergeRels = spark final Dataset<Relation> mergeRels = spark
.createDataset( .createDataset(

View File

@ -100,17 +100,17 @@ public class SparkCreateSimRels extends AbstractSparkAction {
.repartition(numPartitions); .repartition(numPartitions);
// create relations by comparing only elements in the same group // create relations by comparing only elements in the same group
spark.createDataset( spark
Deduper .createDataset(
.computeRelations(sc, blocks, dedupConf) Deduper
.map(t -> createSimRel(t._1(), t._2(), entity)) .computeRelations(sc, blocks, dedupConf)
.repartition(numPartitions) .map(t -> createSimRel(t._1(), t._2(), entity))
.rdd(), .repartition(numPartitions)
Encoders.bean(Relation.class) .rdd(),
) Encoders.bean(Relation.class))
.write() .write()
.mode(SaveMode.Append) .mode(SaveMode.Append)
.parquet(outputPath); .parquet(outputPath);
} }
} }

View File

@ -1,13 +1,11 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import static jdk.nashorn.internal.objects.NativeDebug.map;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport; import java.io.IOException;
import eu.dnetlib.dhp.schema.oaf.*; import java.util.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse; import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet; import org.apache.http.client.methods.HttpGet;
@ -15,6 +13,7 @@ import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients; import org.apache.http.impl.client.HttpClients;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction; import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -24,145 +23,172 @@ import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException; import org.dom4j.DocumentException;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.io.IOException; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import java.util.ArrayList; import eu.dnetlib.dhp.schema.common.EntityType;
import java.util.Iterator; import eu.dnetlib.dhp.schema.common.ModelSupport;
import java.util.List; import eu.dnetlib.dhp.schema.oaf.*;
import java.util.Properties; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.config.DedupConfig;
import scala.Tuple2;
public class SparkPrepareOrgRels extends AbstractSparkAction { public class SparkPrepareOrgRels extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class); private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class);
public static final String ROOT_TRUST = "0.8"; public SparkPrepareOrgRels(ArgumentApplicationParser parser, SparkSession spark) {
public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup"; super(parser, spark);
public static final String PROVENANCE_ACTIONS = "dnet:provenanceActions"; }
public SparkPrepareOrgRels(ArgumentApplicationParser parser, SparkSession spark) { public static void main(String[] args) throws Exception {
super(parser, spark); ArgumentApplicationParser parser = new ArgumentApplicationParser(
} IOUtils
.toString(
SparkCreateSimRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/prepareOrgRels_parameters.json")));
parser.parseArgument(args);
public static void main(String[] args) throws Exception { SparkConf conf = new SparkConf();
ArgumentApplicationParser parser = new ArgumentApplicationParser( conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
IOUtils conf.registerKryoClasses(ModelSupport.getOafModelClasses());
.toString(
SparkCreateSimRels.class
.getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/prepareOrgRels_parameters.json")));
parser.parseArgument(args);
SparkConf conf = new SparkConf(); new SparkPrepareOrgRels(parser, getSparkSession(conf))
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer"); .run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
conf.registerKryoClasses(ModelSupport.getOafModelClasses()); }
new SparkCreateDedupRecord(parser, getSparkSession(conf)) @Override
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl"))); public void run(ISLookUpService isLookUpService) throws IOException {
}
@Override final String graphBasePath = parser.get("graphBasePath");
public void run(ISLookUpService isLookUpService) throws IOException { final String isLookUpUrl = parser.get("isLookUpUrl");
final String actionSetId = parser.get("actionSetId");
final String workingPath = parser.get("workingPath");
final int numConnections = Optional
.ofNullable(parser.get("numConnections"))
.map(Integer::valueOf)
.orElse(NUM_CONNECTIONS);
final String graphBasePath = parser.get("graphBasePath"); final String apiUrl = Optional
final String isLookUpUrl = parser.get("isLookUpUrl"); .ofNullable(parser.get("apiUrl"))
final String actionSetId = parser.get("actionSetId"); .orElse("");
final String workingPath = parser.get("workingPath");
final String apiUrl = parser.get("apiUrl");
final String dbUrl = parser.get("dbUrl");
final String dbTable = parser.get("dbTable");
final String dbUser = parser.get("dbUser");
final String dbPwd = parser.get("dbPwd");
log.info("graphBasePath: '{}'", graphBasePath); final String dbUrl = parser.get("dbUrl");
log.info("isLookUpUrl: '{}'", isLookUpUrl); final String dbTable = parser.get("dbTable");
log.info("actionSetId: '{}'", actionSetId); final String dbUser = parser.get("dbUser");
log.info("workingPath: '{}'", workingPath); final String dbPwd = parser.get("dbPwd");
log.info("apiUrl: '{}'", apiUrl);
log.info("dbUrl: '{}'", dbUrl);
log.info("dbUser: '{}'", dbUser);
log.info("table: '{}'", dbTable);
log.info("dbPwd: '{}'", "xxx");
final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, "organization"); log.info("graphBasePath: '{}'", graphBasePath);
final String entityPath = DedupUtility.createEntityPath(graphBasePath, "organization"); log.info("isLookUpUrl: '{}'", isLookUpUrl);
log.info("actionSetId: '{}'", actionSetId);
log.info("workingPath: '{}'", workingPath);
log.info("numPartitions: '{}'", numConnections);
log.info("apiUrl: '{}'", apiUrl);
log.info("dbUrl: '{}'", dbUrl);
log.info("dbUser: '{}'", dbUser);
log.info("table: '{}'", dbTable);
log.info("dbPwd: '{}'", "xxx");
Dataset<OrgSimRel> relations = createRelations(spark, mergeRelPath, entityPath); final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, "organization");
final String entityPath = DedupUtility.createEntityPath(graphBasePath, "organization");
final Properties connectionProperties = new Properties(); Dataset<OrgSimRel> relations = createRelations(spark, mergeRelPath, entityPath);
connectionProperties.put("user", dbUser);
connectionProperties.put("password", dbPwd);
relations.write().mode(SaveMode.Overwrite).jdbc(dbUrl, dbTable, connectionProperties); final Properties connectionProperties = new Properties();
connectionProperties.put("user", dbUser);
connectionProperties.put("password", dbPwd);
if (!apiUrl.isEmpty()) relations
updateSimRels(apiUrl); .repartition(numConnections)
.write()
.mode(SaveMode.Overwrite)
.jdbc(dbUrl, dbTable, connectionProperties);
} if (!apiUrl.isEmpty())
updateSimRels(apiUrl);
public static Dataset<OrgSimRel> createRelations( }
final SparkSession spark,
final String mergeRelsPath,
final String entitiesPath) {
// <id, json_entity> public static Dataset<OrgSimRel> createRelations(
Dataset<Tuple2<String, Organization>> entities = spark final SparkSession spark,
.read() final String mergeRelsPath,
.textFile(entitiesPath) final String entitiesPath) {
.map(
(MapFunction<String, Tuple2<String, Organization>>) it -> {
Organization entity = OBJECT_MAPPER.readValue(it, Organization.class);
return new Tuple2<>(entity.getId(), entity);
},
Encoders.tuple(Encoders.STRING(), Encoders.kryo(Organization.class)));
Dataset<Tuple2<String, String>> relations = spark.createDataset( // <id, json_entity>
spark Dataset<Tuple2<String, Organization>> entities = spark
.read() .read()
.load(mergeRelsPath) .textFile(entitiesPath)
.as(Encoders.bean(Relation.class)) .map(
.where("relClass == 'merges'") (MapFunction<String, Tuple2<String, Organization>>) it -> {
.toJavaRDD() Organization entity = OBJECT_MAPPER.readValue(it, Organization.class);
.mapToPair(r -> new Tuple2<>(r.getSource(), r.getTarget())) return new Tuple2<>(entity.getId(), entity);
.groupByKey() },
.flatMap(g -> { Encoders.tuple(Encoders.STRING(), Encoders.kryo(Organization.class)));
List<Tuple2<String, String>> rels = new ArrayList<>();
for (String id1 : g._2()) {
for (String id2 : g._2()) {
if (!id1.equals(id2))
if (id1.contains("openorgs"))
rels.add(new Tuple2<>(id1, id2));
}
}
return rels.iterator();
}).rdd(),
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
return relations Dataset<Tuple2<String, String>> relations = spark
.joinWith(entities, relations.col("_2").equalTo(entities.col("_1")), "inner") .createDataset(
.map( spark
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, Organization>>, OrgSimRel>)r -> .read()
new OrgSimRel( .load(mergeRelsPath)
r._1()._2(), .as(Encoders.bean(Relation.class))
r._2()._2().getOriginalId().get(0), .where("relClass == 'merges'")
r._2()._2().getLegalname().getValue(), .toJavaRDD()
r._2()._2().getLegalshortname().getValue(), .mapToPair(r -> new Tuple2<>(r.getSource(), r.getTarget()))
r._2()._2().getCountry().getClassid(), .groupByKey()
r._2()._2().getWebsiteurl().getValue(), .flatMap(g -> {
r._2()._2().getCollectedfrom().get(0).getValue() List<Tuple2<String, String>> rels = new ArrayList<>();
), for (String id1 : g._2()) {
Encoders.bean(OrgSimRel.class) for (String id2 : g._2()) {
); if (!id1.equals(id2))
if (id1.contains("openorgs____") && !id2.contains("openorgsmesh"))
rels.add(new Tuple2<>(id1, id2));
}
}
return rels.iterator();
})
.rdd(),
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
} Dataset<Tuple2<String, OrgSimRel>> relations2 = relations // <openorgs, corda>
.joinWith(entities, relations.col("_2").equalTo(entities.col("_1")), "inner")
.map(
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, Organization>>, OrgSimRel>) r -> new OrgSimRel(
r._1()._1(),
r._2()._2().getOriginalId().get(0),
r._2()._2().getLegalname() != null ? r._2()._2().getLegalname().getValue() : "",
r._2()._2().getLegalshortname() != null ? r._2()._2().getLegalshortname().getValue() : "",
r._2()._2().getCountry() != null ? r._2()._2().getCountry().getClassid() : "",
r._2()._2().getWebsiteurl() != null ? r._2()._2().getWebsiteurl().getValue() : "",
r._2()._2().getCollectedfrom().get(0).getValue()),
Encoders.bean(OrgSimRel.class))
.map(
(MapFunction<OrgSimRel, Tuple2<String, OrgSimRel>>) o -> new Tuple2<>(o.getLocal_id(), o),
Encoders.tuple(Encoders.STRING(), Encoders.bean(OrgSimRel.class)));
private static String updateSimRels(final String apiUrl) throws IOException { return relations2
final HttpGet req = new HttpGet(apiUrl); .joinWith(entities, relations2.col("_1").equalTo(entities.col("_1")), "inner")
try (final CloseableHttpClient client = HttpClients.createDefault()) { .map(
try (final CloseableHttpResponse response = client.execute(req)) { (MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
return IOUtils.toString(response.getEntity().getContent()); OrgSimRel orgSimRel = r._1()._2();
} orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
} return orgSimRel;
} },
Encoders.bean(OrgSimRel.class));
}
private static String updateSimRels(final String apiUrl) throws IOException {
log.info("Updating simrels on the portal");
final HttpGet req = new HttpGet(apiUrl);
try (final CloseableHttpClient client = HttpClients.createDefault()) {
try (final CloseableHttpResponse response = client.execute(req)) {
return IOUtils.toString(response.getEntity().getContent());
}
}
}
} }

View File

@ -1,4 +1,4 @@
<workflow-app name="Duplicate Scan" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="Organization Dedup" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property> <property>
<name>graphBasePath</name> <name>graphBasePath</name>
@ -24,10 +24,6 @@
<name>cutConnectedComponent</name> <name>cutConnectedComponent</name>
<description>max number of elements in a connected component</description> <description>max number of elements in a connected component</description>
</property> </property>
<property>
<name>apiUrl</name>
<description>the url for the APIs of the openorgs service</description>
</property>
<property> <property>
<name>dbUrl</name> <name>dbUrl</name>
<description>the url of the database</description> <description>the url of the database</description>
@ -109,6 +105,16 @@
<fs> <fs>
<delete path="${workingPath}"/> <delete path="${workingPath}"/>
</fs> </fs>
<ok to="copyRelations"/>
<error to="Kill"/>
</action>
<action name="copyRelations">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>-pb</arg>
<arg>${graphBasePath}/relation</arg>
<arg>${workingPath}/${actionSetId}/organization_simrel</arg>
</distcp>
<ok to="CreateSimRel"/> <ok to="CreateSimRel"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -136,16 +142,6 @@
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>--numPartitions</arg><arg>8000</arg> <arg>--numPartitions</arg><arg>8000</arg>
</spark> </spark>
<ok to="copyRelations"/>
<error to="Kill"/>
</action>
<action name="copyRelations">
<distcp xmlns="uri:oozie:distcp-action:0.2">
<arg>-pb</arg>
<arg>${graphBasePath}/relation</arg>
<arg>${workingPath}/organization_simrel</arg>
</distcp>
<ok to="CreateMergeRel"/> <ok to="CreateMergeRel"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
@ -203,6 +199,7 @@
<arg>--dbTable</arg><arg>${dbTable}</arg> <arg>--dbTable</arg><arg>${dbTable}</arg>
<arg>--dbUser</arg><arg>${dbUser}</arg> <arg>--dbUser</arg><arg>${dbUser}</arg>
<arg>--dbPwd</arg><arg>${dbPwd}</arg> <arg>--dbPwd</arg><arg>${dbPwd}</arg>
<arg>--numConnections</arg><arg>20</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -23,11 +23,17 @@
"paramDescription": "the id of the actionset (orchestrator)", "paramDescription": "the id of the actionset (orchestrator)",
"paramRequired": true "paramRequired": true
}, },
{
"paramName": "nc",
"paramLongName": "numConnections",
"paramDescription": "number of connections to the postgres db (for the write operation)",
"paramRequired": false
},
{ {
"paramName": "au", "paramName": "au",
"paramLongName": "apiUrl", "paramLongName": "apiUrl",
"paramDescription": "the url for the APIs of the openorgs service", "paramDescription": "the url for the APIs of the openorgs service",
"paramRequired": true "paramRequired": false
}, },
{ {
"paramName": "du", "paramName": "du",

View File

@ -138,10 +138,10 @@ public class EntityMergerTest implements Serializable {
public void publicationMergerTest3() throws InstantiationException, IllegalAccessException { public void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals( "50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId()); assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
} }
@ -149,7 +149,7 @@ public class EntityMergerTest implements Serializable {
public void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException { public void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException {
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId()); assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());
@ -160,7 +160,7 @@ public class EntityMergerTest implements Serializable {
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException { public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId()); assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());

View File

@ -1,11 +1,18 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import com.fasterxml.jackson.databind.ObjectMapper; import static java.nio.file.Files.createTempDirectory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation; import static org.apache.spark.sql.functions.count;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import static org.junit.jupiter.api.Assertions.assertEquals;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import static org.mockito.Mockito.lenient;
import eu.dnetlib.pace.util.MapDocumentUtil;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -21,19 +28,16 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.Mockito; import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2; import scala.Tuple2;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.lenient;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkDedupTest implements Serializable { public class SparkDedupTest implements Serializable {
@ -48,7 +52,7 @@ public class SparkDedupTest implements Serializable {
private static String testOutputBasePath; private static String testOutputBasePath;
private static String testDedupGraphBasePath; private static String testDedupGraphBasePath;
private static final String testActionSetId = "test-orchestrator"; private static final String testActionSetId = "test-orchestrator";
private static String testDedupAssertionsBasePath; private static String testDedupAssertionsBasePath;
@BeforeAll @BeforeAll
public static void cleanUp() throws IOException, URISyntaxException { public static void cleanUp() throws IOException, URISyntaxException {
@ -64,9 +68,9 @@ public class SparkDedupTest implements Serializable {
.toAbsolutePath() .toAbsolutePath()
.toString(); .toString();
testDedupAssertionsBasePath = Paths testDedupAssertionsBasePath = Paths
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/assertions").toURI()) .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/assertions").toURI())
.toFile() .toFile()
.getAbsolutePath(); .getAbsolutePath();
FileUtils.deleteDirectory(new File(testOutputBasePath)); FileUtils.deleteDirectory(new File(testOutputBasePath));
FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
@ -82,7 +86,7 @@ public class SparkDedupTest implements Serializable {
jsc = JavaSparkContext.fromSparkContext(spark.sparkContext()); jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
} }
@BeforeEach @BeforeEach
public void setUp() throws IOException, ISLookUpException { public void setUp() throws IOException, ISLookUpException {
@ -165,98 +169,98 @@ public class SparkDedupTest implements Serializable {
new SparkCreateSimRels(parser, spark).run(isLookUpService); new SparkCreateSimRels(parser, spark).run(isLookUpService);
long orgs_simrel = spark long orgs_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
.count(); .count();
long pubs_simrel = spark long pubs_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
.count(); .count();
long sw_simrel = spark long sw_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
.count(); .count();
long ds_simrel = spark long ds_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
.count(); .count();
long orp_simrel = spark long orp_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
.count(); .count();
assertEquals(3432, orgs_simrel); assertEquals(3082, orgs_simrel);
assertEquals(7152, pubs_simrel); assertEquals(7036, pubs_simrel);
assertEquals(344, sw_simrel); assertEquals(344, sw_simrel);
assertEquals(458, ds_simrel); assertEquals(442, ds_simrel);
assertEquals(6750, orp_simrel); assertEquals(6750, orp_simrel);
} }
@Test @Test
@Order(2) @Order(2)
public void collectSimRelsTest() throws Exception { public void collectSimRelsTest() throws Exception {
ArgumentApplicationParser parser = new ArgumentApplicationParser( ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils IOUtils
.toString( .toString(
SparkCreateSimRels.class SparkCollectSimRels.class
.getResourceAsStream( .getResourceAsStream(
"/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json"))); "/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json")));
parser parser
.parseArgument( .parseArgument(
new String[] { new String[] {
"-asi", testActionSetId, "-asi", testActionSetId,
"-la", "lookupurl", "-la", "lookupurl",
"-w", testOutputBasePath, "-w", testOutputBasePath,
"-np", "50", "-np", "50",
"-purl", "jdbc:postgresql://localhost:5432/dnet_dedup", "-purl", "jdbc:postgresql://localhost:5432/dnet_dedup",
"-pusr", "postgres_url", "-pusr", "postgres_user",
"-ppwd", "" "-ppwd", ""
}); });
new SparkCollectSimRels( new SparkCollectSimRels(
parser, parser,
spark, spark,
spark.read().load(testDedupAssertionsBasePath + "/similarity_groups"), spark.read().load(testDedupAssertionsBasePath + "/similarity_groups"),
spark.read().load(testDedupAssertionsBasePath + "/groups") spark.read().load(testDedupAssertionsBasePath + "/groups"))
).run(null); .run(isLookUpService);
long orgs_simrel = spark long orgs_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
.count(); .count();
long pubs_simrel = spark long pubs_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
.count(); .count();
long sw_simrel = spark long sw_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
.count(); .count();
long ds_simrel = spark long ds_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
.count(); .count();
long orp_simrel = spark long orp_simrel = spark
.read() .read()
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel") .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
.count(); .count();
assertEquals(4022, orgs_simrel); assertEquals(3672, orgs_simrel);
assertEquals(10575, pubs_simrel); assertEquals(10459, pubs_simrel);
assertEquals(3767, sw_simrel); assertEquals(3767, sw_simrel);
assertEquals(3881, ds_simrel); assertEquals(3865, ds_simrel);
assertEquals(10173, orp_simrel); assertEquals(10173, orp_simrel);
} }
@Test @Test
@Order(3) @Order(3)
@ -402,8 +406,8 @@ public class SparkDedupTest implements Serializable {
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel") .load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
.count(); .count();
assertEquals(1276, orgs_mergerel); assertEquals(1272, orgs_mergerel);
assertEquals(1442, pubs_mergerel); assertEquals(1438, pubs_mergerel);
assertEquals(288, sw_mergerel); assertEquals(288, sw_mergerel);
assertEquals(472, ds_mergerel); assertEquals(472, ds_mergerel);
assertEquals(718, orp_mergerel); assertEquals(718, orp_mergerel);
@ -449,10 +453,10 @@ public class SparkDedupTest implements Serializable {
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord") testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
.count(); .count();
assertEquals(82, orgs_deduprecord); assertEquals(84, orgs_deduprecord);
assertEquals(66, pubs_deduprecord); assertEquals(65, pubs_deduprecord);
assertEquals(51, sw_deduprecord); assertEquals(51, sw_deduprecord);
assertEquals(96, ds_deduprecord); assertEquals(97, ds_deduprecord);
assertEquals(89, orp_deduprecord); assertEquals(89, orp_deduprecord);
} }
@ -532,12 +536,12 @@ public class SparkDedupTest implements Serializable {
.distinct() .distinct()
.count(); .count();
assertEquals(897, publications); assertEquals(896, publications);
assertEquals(835, organizations); assertEquals(837, organizations);
assertEquals(100, projects); assertEquals(100, projects);
assertEquals(100, datasource); assertEquals(100, datasource);
assertEquals(200, softwares); assertEquals(200, softwares);
assertEquals(388, dataset); assertEquals(389, dataset);
assertEquals(517, otherresearchproduct); assertEquals(517, otherresearchproduct);
long deletedOrgs = jsc long deletedOrgs = jsc
@ -592,7 +596,7 @@ public class SparkDedupTest implements Serializable {
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count(); long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
assertEquals(4866, relations); assertEquals(4858, relations);
// check deletedbyinference // check deletedbyinference
final Dataset<Relation> mergeRels = spark final Dataset<Relation> mergeRels = spark
@ -641,11 +645,11 @@ public class SparkDedupTest implements Serializable {
assertEquals(expected_unique, rel.distinct().count()); assertEquals(expected_unique, rel.distinct().count());
} }
// @AfterAll @AfterAll
// public static void finalCleanUp() throws IOException { public static void finalCleanUp() throws IOException {
// FileUtils.deleteDirectory(new File(testOutputBasePath)); FileUtils.deleteDirectory(new File(testOutputBasePath));
// FileUtils.deleteDirectory(new File(testDedupGraphBasePath)); FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
// } }
public boolean isDeletedByInference(String s) { public boolean isDeletedByInference(String s) {
return s.contains("\"deletedbyinference\":true"); return s.contains("\"deletedbyinference\":true");

View File

@ -315,7 +315,7 @@
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>
<artifactId>dnet-pace-core</artifactId> <artifactId>dnet-pace-core</artifactId>
<version>4.0.4</version> <version>4.0.5</version>
</dependency> </dependency>
<dependency> <dependency>
<groupId>eu.dnetlib</groupId> <groupId>eu.dnetlib</groupId>