forked from antonis.lempesis/dnet-hadoop
minor changes in dedup tests, bug fix in the idgenerator and pace-core version update
This commit is contained in:
parent
4cf79f32eb
commit
e3f7798d1b
|
@ -90,6 +90,10 @@
|
||||||
<groupId>com.fasterxml.jackson.core</groupId>
|
<groupId>com.fasterxml.jackson.core</groupId>
|
||||||
<artifactId>jackson-core</artifactId>
|
<artifactId>jackson-core</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
|
<artifactId>httpclient</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
</dependencies>
|
</dependencies>
|
||||||
|
|
||||||
|
|
|
@ -29,6 +29,7 @@ import eu.dnetlib.pace.config.DedupConfig;
|
||||||
abstract class AbstractSparkAction implements Serializable {
|
abstract class AbstractSparkAction implements Serializable {
|
||||||
|
|
||||||
protected static final int NUM_PARTITIONS = 1000;
|
protected static final int NUM_PARTITIONS = 1000;
|
||||||
|
protected static final int NUM_CONNECTIONS = 20;
|
||||||
|
|
||||||
protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
|
@ -1,12 +1,10 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
import java.text.ParseException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import java.text.SimpleDateFormat;
|
||||||
import com.google.common.collect.Lists;
|
import java.util.*;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
import org.apache.commons.lang.StringUtils;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
|
@ -15,11 +13,15 @@ import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
import java.text.ParseException;
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
import java.text.SimpleDateFormat;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import java.util.*;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class DedupRecordFactory {
|
public class DedupRecordFactory {
|
||||||
|
|
||||||
|
@ -80,14 +82,14 @@ public class DedupRecordFactory {
|
||||||
|
|
||||||
final Collection<String> dates = Lists.newArrayList();
|
final Collection<String> dates = Lists.newArrayList();
|
||||||
final List<List<Author>> authors = Lists.newArrayList();
|
final List<List<Author>> authors = Lists.newArrayList();
|
||||||
final List<Identifier> bestPids = Lists.newArrayList(); //best pids list
|
final List<Identifier> bestPids = Lists.newArrayList(); // best pids list
|
||||||
|
|
||||||
entities
|
entities
|
||||||
.forEachRemaining(
|
.forEachRemaining(
|
||||||
t -> {
|
t -> {
|
||||||
T duplicate = t._2();
|
T duplicate = t._2();
|
||||||
|
|
||||||
//prepare the list of pids to use for the id generation
|
// prepare the list of pids to use for the id generation
|
||||||
bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate));
|
bestPids.addAll(IdGenerator.bestPidtoIdentifier(duplicate));
|
||||||
|
|
||||||
entity.mergeFrom(duplicate);
|
entity.mergeFrom(duplicate);
|
||||||
|
@ -115,5 +117,4 @@ public class DedupRecordFactory {
|
||||||
return entity;
|
return entity;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,90 +1,112 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import org.apache.commons.lang.NullArgumentException;
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.text.ParseException;
|
import java.text.ParseException;
|
||||||
import java.text.SimpleDateFormat;
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
import org.apache.commons.lang.NullArgumentException;
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
public class IdGenerator implements Serializable {
|
public class IdGenerator implements Serializable {
|
||||||
|
|
||||||
private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
||||||
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
||||||
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
|
||||||
|
|
||||||
//pick the best pid from the list (consider date and pidtype)
|
// pick the best pid from the list (consider date and pidtype)
|
||||||
public static String generate(List<Identifier> pids, String defaultID) {
|
public static String generate(List<Identifier> pids, String defaultID) {
|
||||||
if (pids == null || pids.size() == 0)
|
if (pids == null || pids.size() == 0)
|
||||||
return defaultID;
|
return defaultID;
|
||||||
|
|
||||||
Optional<Identifier> bp = pids.stream()
|
Optional<Identifier> bp = pids
|
||||||
.max(Identifier::compareTo);
|
.stream()
|
||||||
|
.max(Identifier::compareTo);
|
||||||
|
|
||||||
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
|
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
|
||||||
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::" + DedupUtility.md5(bp.get().getOriginalID());
|
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
|
||||||
} else {
|
+ DedupUtility.md5(bp.get().getOriginalID());
|
||||||
return bp.get().getOriginalID().split("\\|")[0] + "|" + createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::" + DedupUtility.md5(bp.get().getPid().getValue());
|
} else {
|
||||||
}
|
return bp.get().getOriginalID().split("\\|")[0] + "|"
|
||||||
|
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
|
||||||
|
+ DedupUtility.md5(bp.get().getPid().getValue());
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
||||||
public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) {
|
public static <T extends OafEntity> List<Identifier> bestPidtoIdentifier(T entity) {
|
||||||
|
|
||||||
if (entity.getPid() == null || entity.getPid().size() == 0)
|
if (entity.getPid() == null || entity.getPid().size() == 0)
|
||||||
return Lists.newArrayList(new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId()));
|
return Lists
|
||||||
|
.newArrayList(
|
||||||
|
new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(),
|
||||||
|
EntityType.fromClass(entity.getClass()), entity.getId()));
|
||||||
|
|
||||||
Optional<StructuredProperty> bp = entity.getPid().stream()
|
Optional<StructuredProperty> bp = entity
|
||||||
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
|
.getPid()
|
||||||
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
|
.stream()
|
||||||
|
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
|
||||||
|
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
|
||||||
|
|
||||||
return bp.map(structuredProperty ->
|
return bp
|
||||||
Lists.newArrayList(new Identifier(structuredProperty, extractDate(entity, sdf), PidType.classidValueOf(structuredProperty.getQualifier().getClassid()), entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId()))
|
.map(
|
||||||
).orElseGet(() -> Lists.newArrayList(new Identifier(new StructuredProperty(), new Date(), PidType.original, entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())));
|
structuredProperty -> Lists
|
||||||
|
.newArrayList(
|
||||||
|
new Identifier(structuredProperty, extractDate(entity, new SimpleDateFormat("yyyy-MM-dd")),
|
||||||
|
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
|
||||||
|
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
|
||||||
|
.orElseGet(
|
||||||
|
() -> Lists
|
||||||
|
.newArrayList(
|
||||||
|
new Identifier(new StructuredProperty(), new Date(), PidType.original,
|
||||||
|
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())));
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//create the prefix (length = 12): dedup_+ pidType
|
// create the prefix (length = 12): dedup_+ pidType
|
||||||
public static String createPrefix(String pidType) {
|
public static String createPrefix(String pidType) {
|
||||||
|
|
||||||
StringBuilder prefix = new StringBuilder("dedup_" + pidType);
|
StringBuilder prefix = new StringBuilder("dedup_" + pidType);
|
||||||
|
|
||||||
while (prefix.length() < 12) {
|
while (prefix.length() < 12) {
|
||||||
prefix.append("_");
|
prefix.append("_");
|
||||||
}
|
}
|
||||||
return prefix.toString().substring(0, 12);
|
return prefix.toString().substring(0, 12);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//extracts the date from the record. If the date is not available or is not well-formed, it returns a base date: 2000-01-01
|
// extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
|
||||||
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf){
|
// 2000-01-01
|
||||||
|
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
|
||||||
|
|
||||||
String date = "2000-01-01";
|
String date = "2000-01-01";
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
Result result = (Result) duplicate;
|
Result result = (Result) duplicate;
|
||||||
if (isWellformed(result.getDateofacceptance())){
|
if (isWellformed(result.getDateofacceptance())) {
|
||||||
date = result.getDateofacceptance().getValue();
|
date = result.getDateofacceptance().getValue();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return sdf.parse(date);
|
return sdf.parse(date);
|
||||||
} catch (ParseException e) {
|
} catch (ParseException e) {
|
||||||
return new Date();
|
return new Date();
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static boolean isWellformed(Field<String> date) {
|
public static boolean isWellformed(Field<String> date) {
|
||||||
return date != null && StringUtils.isNotBlank(date.getValue()) && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
|
return date != null && StringUtils.isNotBlank(date.getValue())
|
||||||
}
|
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,132 +1,138 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class Identifier implements Serializable, Comparable<Identifier>{
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
StructuredProperty pid;
|
public class Identifier implements Serializable, Comparable<Identifier> {
|
||||||
Date date;
|
|
||||||
PidType type;
|
|
||||||
List<KeyValue> collectedFrom;
|
|
||||||
EntityType entityType;
|
|
||||||
String originalID;
|
|
||||||
|
|
||||||
boolean useOriginal = false; //to know if the top identifier won because of the alphabetical order of the original ID
|
StructuredProperty pid;
|
||||||
|
Date date;
|
||||||
|
PidType type;
|
||||||
|
List<KeyValue> collectedFrom;
|
||||||
|
EntityType entityType;
|
||||||
|
String originalID;
|
||||||
|
|
||||||
public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom, EntityType entityType, String originalID) {
|
boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original
|
||||||
this.pid = pid;
|
// ID
|
||||||
this.date = date;
|
|
||||||
this.type = type;
|
|
||||||
this.collectedFrom = collectedFrom;
|
|
||||||
this.entityType = entityType;
|
|
||||||
this.originalID = originalID;
|
|
||||||
}
|
|
||||||
|
|
||||||
public StructuredProperty getPid() {
|
public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom,
|
||||||
return pid;
|
EntityType entityType, String originalID) {
|
||||||
}
|
this.pid = pid;
|
||||||
|
this.date = date;
|
||||||
|
this.type = type;
|
||||||
|
this.collectedFrom = collectedFrom;
|
||||||
|
this.entityType = entityType;
|
||||||
|
this.originalID = originalID;
|
||||||
|
}
|
||||||
|
|
||||||
public void setPid(StructuredProperty pidValue) {
|
public StructuredProperty getPid() {
|
||||||
this.pid = pidValue; // store the argument; the parameter is named pidValue, so "this.pid = pid" assigned the field to itself
|
return pid;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Date getDate() {
|
public void setPid(StructuredProperty pidValue) {
|
||||||
return date;
|
this.pid = pidValue; // store the argument; the parameter is named pidValue, so "this.pid = pid" assigned the field to itself
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDate(Date date) {
|
public Date getDate() {
|
||||||
this.date = date;
|
return date;
|
||||||
}
|
}
|
||||||
|
|
||||||
public PidType getType() {
|
public void setDate(Date date) {
|
||||||
return type;
|
this.date = date;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setType(PidType type) {
|
public PidType getType() {
|
||||||
this.type = type;
|
return type;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<KeyValue> getCollectedFrom() {
|
public void setType(PidType type) {
|
||||||
return collectedFrom;
|
this.type = type;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setCollectedFrom(List<KeyValue> collectedFrom) {
|
public List<KeyValue> getCollectedFrom() {
|
||||||
this.collectedFrom = collectedFrom;
|
return collectedFrom;
|
||||||
}
|
}
|
||||||
|
|
||||||
public EntityType getEntityType() {
|
public void setCollectedFrom(List<KeyValue> collectedFrom) {
|
||||||
return entityType;
|
this.collectedFrom = collectedFrom;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setEntityType(EntityType entityType) {
|
public EntityType getEntityType() {
|
||||||
this.entityType = entityType;
|
return entityType;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOriginalID() {
|
public void setEntityType(EntityType entityType) {
|
||||||
return originalID;
|
this.entityType = entityType;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOriginalID(String originalID) {
|
public String getOriginalID() {
|
||||||
this.originalID = originalID;
|
return originalID;
|
||||||
}
|
}
|
||||||
|
|
||||||
public boolean isUseOriginal() {
|
public void setOriginalID(String originalID) {
|
||||||
return useOriginal;
|
this.originalID = originalID;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setUseOriginal(boolean useOriginal) {
|
public boolean isUseOriginal() {
|
||||||
this.useOriginal = useOriginal;
|
return useOriginal;
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
public void setUseOriginal(boolean useOriginal) {
|
||||||
public int compareTo(Identifier i) {
|
this.useOriginal = useOriginal;
|
||||||
//priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4) alphabetical order of the originalID
|
}
|
||||||
if (this.getType().compareTo(i.getType()) == 0){ //same type
|
|
||||||
if (entityType == EntityType.publication) {
|
|
||||||
if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID) && !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
|
|
||||||
return 1;
|
|
||||||
if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID) && !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (entityType == EntityType.dataset) {
|
|
||||||
if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID) && !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
|
|
||||||
return 1;
|
|
||||||
if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID) && !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (this.getDate().compareTo(date) == 0) {//same date
|
@Override
|
||||||
|
public int compareTo(Identifier i) {
|
||||||
|
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
||||||
|
// alphabetical order of the originalID
|
||||||
|
if (this.getType().compareTo(i.getType()) == 0) { // same type
|
||||||
|
if (entityType == EntityType.publication) {
|
||||||
|
if (isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID)
|
||||||
|
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID))
|
||||||
|
return 1;
|
||||||
|
if (isFromDatasourceID(i.collectedFrom, IdGenerator.CROSSREF_ID)
|
||||||
|
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.CROSSREF_ID))
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
if (entityType == EntityType.dataset) {
|
||||||
|
if (isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID)
|
||||||
|
&& !isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID))
|
||||||
|
return 1;
|
||||||
|
if (isFromDatasourceID(i.collectedFrom, IdGenerator.DATACITE_ID)
|
||||||
|
&& !isFromDatasourceID(this.collectedFrom, IdGenerator.DATACITE_ID))
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
if (this.originalID.compareTo(i.originalID) > 0)
|
if (this.getDate().compareTo(date) == 0) {// same date
|
||||||
this.useOriginal = true;
|
|
||||||
else
|
|
||||||
i.setUseOriginal(true);
|
|
||||||
|
|
||||||
//the minus because we need to take the alphabetically lower id
|
if (this.originalID.compareTo(i.originalID) > 0)
|
||||||
return -this.originalID.compareTo(i.originalID);
|
this.useOriginal = true;
|
||||||
}
|
else
|
||||||
else
|
i.setUseOriginal(true);
|
||||||
//the minus is because we need to take the older date
|
|
||||||
return -this.getDate().compareTo(date);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
return this.getType().compareTo(i.getType());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
// the minus because we need to take the alphabetically lower id
|
||||||
|
return -this.originalID.compareTo(i.originalID);
|
||||||
|
} else
|
||||||
|
// the minus is because we need to take the older date
|
||||||
|
return -this.getDate().compareTo(date);
|
||||||
|
} else {
|
||||||
|
return this.getType().compareTo(i.getType());
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId){
|
}
|
||||||
|
|
||||||
for(KeyValue cf: collectedFrom) {
|
public boolean isFromDatasourceID(List<KeyValue> collectedFrom, String dsId) {
|
||||||
if(cf.getKey().equals(dsId))
|
|
||||||
return true;
|
for (KeyValue cf : collectedFrom) {
|
||||||
}
|
if (cf.getKey().equals(dsId))
|
||||||
return false;
|
return true;
|
||||||
}
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,83 +1,98 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
|
||||||
public class OrgSimRel implements Serializable {
|
public class OrgSimRel implements Serializable {
|
||||||
|
|
||||||
String local_id;
|
String local_id;
|
||||||
String oa_original_id;
|
String oa_original_id;
|
||||||
String oa_name;
|
String oa_name;
|
||||||
String oa_acronym;
|
String oa_acronym;
|
||||||
String oa_country;
|
String oa_country;
|
||||||
String oa_url;
|
String oa_url;
|
||||||
String oa_collectedfrom;
|
String oa_collectedfrom;
|
||||||
|
|
||||||
public OrgSimRel() {
|
public OrgSimRel() {
|
||||||
}
|
}
|
||||||
|
|
||||||
public OrgSimRel(String local_id, String oa_original_id, String oa_name, String oa_acronym, String oa_country, String oa_url, String oa_collectedfrom) {
|
public OrgSimRel(String local_id, String oa_original_id, String oa_name, String oa_acronym, String oa_country,
|
||||||
this.local_id = local_id;
|
String oa_url, String oa_collectedfrom) {
|
||||||
this.oa_original_id = oa_original_id;
|
this.local_id = local_id;
|
||||||
this.oa_name = oa_name;
|
this.oa_original_id = oa_original_id;
|
||||||
this.oa_acronym = oa_acronym;
|
this.oa_name = oa_name;
|
||||||
this.oa_country = oa_country;
|
this.oa_acronym = oa_acronym;
|
||||||
this.oa_url = oa_url;
|
this.oa_country = oa_country;
|
||||||
this.oa_collectedfrom = oa_collectedfrom;
|
this.oa_url = oa_url;
|
||||||
}
|
this.oa_collectedfrom = oa_collectedfrom;
|
||||||
|
}
|
||||||
|
|
||||||
public String getLocal_id() {
|
public String getLocal_id() {
|
||||||
return local_id;
|
return local_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setLocal_id(String local_id) {
|
public void setLocal_id(String local_id) {
|
||||||
this.local_id = local_id;
|
this.local_id = local_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOa_original_id() {
|
public String getOa_original_id() {
|
||||||
return oa_original_id;
|
return oa_original_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOa_original_id(String oa_original_id) {
|
public void setOa_original_id(String oa_original_id) {
|
||||||
this.oa_original_id = oa_original_id;
|
this.oa_original_id = oa_original_id;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOa_name() {
|
public String getOa_name() {
|
||||||
return oa_name;
|
return oa_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOa_name(String oa_name) {
|
public void setOa_name(String oa_name) {
|
||||||
this.oa_name = oa_name;
|
this.oa_name = oa_name;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOa_acronym() {
|
public String getOa_acronym() {
|
||||||
return oa_acronym;
|
return oa_acronym;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOa_acronym(String oa_acronym) {
|
public void setOa_acronym(String oa_acronym) {
|
||||||
this.oa_acronym = oa_acronym;
|
this.oa_acronym = oa_acronym;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOa_country() {
|
public String getOa_country() {
|
||||||
return oa_country;
|
return oa_country;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOa_country(String oa_country) {
|
public void setOa_country(String oa_country) {
|
||||||
this.oa_country = oa_country;
|
this.oa_country = oa_country;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOa_url() {
|
public String getOa_url() {
|
||||||
return oa_url;
|
return oa_url;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOa_url(String oa_url) {
|
public void setOa_url(String oa_url) {
|
||||||
this.oa_url = oa_url;
|
this.oa_url = oa_url;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOa_collectedfrom() {
|
public String getOa_collectedfrom() {
|
||||||
return oa_collectedfrom;
|
return oa_collectedfrom;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOa_collectedfrom(String oa_collectedfrom) {
|
public void setOa_collectedfrom(String oa_collectedfrom) {
|
||||||
this.oa_collectedfrom = oa_collectedfrom;
|
this.oa_collectedfrom = oa_collectedfrom;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "OrgSimRel{" +
|
||||||
|
"local_id='" + local_id + '\'' +
|
||||||
|
", oa_original_id='" + oa_original_id + '\'' +
|
||||||
|
", oa_name='" + oa_name + '\'' +
|
||||||
|
", oa_acronym='" + oa_acronym + '\'' +
|
||||||
|
", oa_country='" + oa_country + '\'' +
|
||||||
|
", oa_url='" + oa_url + '\'' +
|
||||||
|
", oa_collectedfrom='" + oa_collectedfrom + '\'' +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,25 +1,17 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
public enum PidType {
|
public enum PidType {
|
||||||
|
|
||||||
//from the least to the most important
|
// from the least to the most important
|
||||||
undefined,
|
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, doi;
|
||||||
original,
|
|
||||||
orcid,
|
|
||||||
ror,
|
|
||||||
grid,
|
|
||||||
pdb,
|
|
||||||
arXiv,
|
|
||||||
pmid,
|
|
||||||
doi;
|
|
||||||
|
|
||||||
public static PidType classidValueOf(String s){
|
public static PidType classidValueOf(String s) {
|
||||||
try {
|
try {
|
||||||
return PidType.valueOf(s);
|
return PidType.valueOf(s);
|
||||||
}
|
} catch (Exception e) {
|
||||||
catch (Exception e) {
|
return PidType.undefined;
|
||||||
return PidType.undefined;
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,21 +1,5 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.sql.*;
|
|
||||||
import org.dom4j.DocumentException;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
@ -24,153 +8,177 @@ import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.spark.SparkConf;
|
||||||
|
import org.apache.spark.api.java.JavaPairRDD;
|
||||||
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.sql.*;
|
||||||
|
import org.dom4j.DocumentException;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class SparkCollectSimRels extends AbstractSparkAction {
|
public class SparkCollectSimRels extends AbstractSparkAction {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(SparkCollectSimRels.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkCollectSimRels.class);
|
||||||
|
|
||||||
Dataset<Row> simGroupsDS;
|
Dataset<Row> simGroupsDS;
|
||||||
Dataset<Row> groupsDS;
|
Dataset<Row> groupsDS;
|
||||||
|
|
||||||
public SparkCollectSimRels(ArgumentApplicationParser parser, SparkSession spark, Dataset<Row> simGroupsDS, Dataset<Row> groupsDS) {
|
public SparkCollectSimRels(ArgumentApplicationParser parser, SparkSession spark, Dataset<Row> simGroupsDS,
|
||||||
super(parser, spark);
|
Dataset<Row> groupsDS) {
|
||||||
this.simGroupsDS = simGroupsDS;
|
super(parser, spark);
|
||||||
this.groupsDS = groupsDS;
|
this.simGroupsDS = simGroupsDS;
|
||||||
}
|
this.groupsDS = groupsDS;
|
||||||
|
}
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
public static void main(String[] args) throws Exception {
|
||||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
SparkBlockStats.class
|
SparkBlockStats.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json")));
|
"/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json")));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
SparkConf conf = new SparkConf();
|
||||||
|
|
||||||
final String dbUrl = parser.get("postgresUrl");
|
final String dbUrl = parser.get("postgresUrl");
|
||||||
final String dbUser = parser.get("postgresUser");
|
final String dbUser = parser.get("postgresUser");
|
||||||
final String dbPassword = parser.get("postgresPassword");
|
final String dbPassword = parser.get("postgresPassword");
|
||||||
|
|
||||||
SparkSession spark = getSparkSession(conf);
|
SparkSession spark = getSparkSession(conf);
|
||||||
|
|
||||||
DataFrameReader readOptions = spark.read()
|
DataFrameReader readOptions = spark
|
||||||
.format("jdbc")
|
.read()
|
||||||
.option("url", dbUrl)
|
.format("jdbc")
|
||||||
.option("user", dbUser)
|
.option("url", dbUrl)
|
||||||
.option("password", dbPassword);
|
.option("user", dbUser)
|
||||||
|
.option("password", dbPassword);
|
||||||
|
|
||||||
new SparkCollectSimRels(
|
new SparkCollectSimRels(
|
||||||
parser,
|
parser,
|
||||||
spark,
|
spark,
|
||||||
readOptions.option("dbtable", "similarity_groups").load(),
|
readOptions.option("dbtable", "similarity_groups").load(),
|
||||||
readOptions.option("dbtable", "groups").load()
|
readOptions.option("dbtable", "groups").load())
|
||||||
).run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
void run(ISLookUpService isLookUpService) throws DocumentException, ISLookUpException, IOException {
|
void run(ISLookUpService isLookUpService) throws DocumentException, ISLookUpException, IOException {
|
||||||
|
|
||||||
// read oozie parameters
|
// read oozie parameters
|
||||||
final String isLookUpUrl = parser.get("isLookUpUrl");
|
final String isLookUpUrl = parser.get("isLookUpUrl");
|
||||||
final String actionSetId = parser.get("actionSetId");
|
final String actionSetId = parser.get("actionSetId");
|
||||||
final String workingPath = parser.get("workingPath");
|
final String workingPath = parser.get("workingPath");
|
||||||
final int numPartitions = Optional
|
final int numPartitions = Optional
|
||||||
.ofNullable(parser.get("numPartitions"))
|
.ofNullable(parser.get("numPartitions"))
|
||||||
.map(Integer::valueOf)
|
.map(Integer::valueOf)
|
||||||
.orElse(NUM_PARTITIONS);
|
.orElse(NUM_PARTITIONS);
|
||||||
final String dbUrl = parser.get("postgresUrl");
|
final String dbUrl = parser.get("postgresUrl");
|
||||||
final String dbUser = parser.get("postgresUser");
|
final String dbUser = parser.get("postgresUser");
|
||||||
|
|
||||||
log.info("numPartitions: '{}'", numPartitions);
|
log.info("numPartitions: '{}'", numPartitions);
|
||||||
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
||||||
log.info("actionSetId: '{}'", actionSetId);
|
log.info("actionSetId: '{}'", actionSetId);
|
||||||
log.info("workingPath: '{}'", workingPath);
|
log.info("workingPath: '{}'", workingPath);
|
||||||
log.info("postgresUser: {}", dbUser);
|
log.info("postgresUser: {}", dbUser);
|
||||||
log.info("postgresUrl: {}", dbUrl);
|
log.info("postgresUrl: {}", dbUrl);
|
||||||
log.info("postgresPassword: xxx");
|
log.info("postgresPassword: xxx");
|
||||||
|
|
||||||
JavaPairRDD<String, List<String>> similarityGroup =
|
JavaPairRDD<String, List<String>> similarityGroup = simGroupsDS
|
||||||
simGroupsDS
|
.toJavaRDD()
|
||||||
.toJavaRDD()
|
.mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)))
|
||||||
.mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)))
|
.groupByKey()
|
||||||
.groupByKey()
|
.mapToPair(
|
||||||
.mapToPair(i -> new Tuple2<>(i._1(), StreamSupport.stream(i._2().spliterator(), false)
|
i -> new Tuple2<>(i._1(), StreamSupport
|
||||||
.collect(Collectors.toList())));
|
.stream(i._2().spliterator(), false)
|
||||||
|
.collect(Collectors.toList())));
|
||||||
|
|
||||||
JavaPairRDD<String, String> groupIds =
|
JavaPairRDD<String, String> groupIds = groupsDS
|
||||||
groupsDS
|
.toJavaRDD()
|
||||||
.toJavaRDD()
|
.mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)));
|
||||||
.mapToPair(r -> new Tuple2<>(r.getString(0), r.getString(1)));
|
|
||||||
|
|
||||||
JavaRDD<Tuple2<Tuple2<String, String>, List<String>>> groups = similarityGroup
|
JavaRDD<Tuple2<Tuple2<String, String>, List<String>>> groups = similarityGroup
|
||||||
.leftOuterJoin(groupIds)
|
.leftOuterJoin(groupIds)
|
||||||
.filter(g -> g._2()._2().isPresent())
|
.filter(g -> g._2()._2().isPresent())
|
||||||
.map(g -> new Tuple2<>(new Tuple2<>(g._1(), g._2()._2().get()), g._2()._1()));
|
.map(g -> new Tuple2<>(new Tuple2<>(g._1(), g._2()._2().get()), g._2()._1()));
|
||||||
|
|
||||||
JavaRDD<Relation> relations = groups.flatMap(g -> {
|
JavaRDD<Relation> relations = groups.flatMap(g -> {
|
||||||
String firstId = g._2().get(0);
|
String firstId = g._2().get(0);
|
||||||
List<Relation> rels = new ArrayList<>();
|
List<Relation> rels = new ArrayList<>();
|
||||||
|
|
||||||
for (String id : g._2()) {
|
for (String id : g._2()) {
|
||||||
if (!firstId.equals(id))
|
if (!firstId.equals(id))
|
||||||
rels.add(createSimRel(firstId, id, g._1()._2()));
|
rels.add(createSimRel(firstId, id, g._1()._2()));
|
||||||
}
|
}
|
||||||
|
|
||||||
return rels.iterator();
|
return rels.iterator();
|
||||||
});
|
});
|
||||||
|
|
||||||
Dataset<Relation> resultRelations = spark.createDataset(
|
Dataset<Relation> resultRelations = spark
|
||||||
relations.filter(r -> r.getRelType().equals("resultResult")).rdd(),
|
.createDataset(
|
||||||
Encoders.bean(Relation.class)
|
relations.filter(r -> r.getRelType().equals("resultResult")).rdd(),
|
||||||
).repartition(numPartitions);
|
Encoders.bean(Relation.class))
|
||||||
|
.repartition(numPartitions);
|
||||||
|
|
||||||
Dataset<Relation> organizationRelations = spark.createDataset(
|
Dataset<Relation> organizationRelations = spark
|
||||||
relations.filter(r -> r.getRelType().equals("organizationOrganization")).rdd(),
|
.createDataset(
|
||||||
Encoders.bean(Relation.class)
|
relations.filter(r -> r.getRelType().equals("organizationOrganization")).rdd(),
|
||||||
).repartition(numPartitions);
|
Encoders.bean(Relation.class))
|
||||||
|
.repartition(numPartitions);
|
||||||
|
|
||||||
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
for (DedupConfig dedupConf : getConfigurations(isLookUpService, actionSetId)) {
|
||||||
switch(dedupConf.getWf().getSubEntityValue()){
|
switch (dedupConf.getWf().getSubEntityValue()) {
|
||||||
case "organization":
|
case "organization":
|
||||||
savePostgresRelation(organizationRelations, workingPath, actionSetId, "organization");
|
savePostgresRelation(organizationRelations, workingPath, actionSetId, "organization");
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
savePostgresRelation(resultRelations, workingPath, actionSetId, dedupConf.getWf().getSubEntityValue());
|
savePostgresRelation(
|
||||||
break;
|
resultRelations, workingPath, actionSetId, dedupConf.getWf().getSubEntityValue());
|
||||||
}
|
break;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private Relation createSimRel(String source, String target, String entity) {
|
private Relation createSimRel(String source, String target, String entity) {
|
||||||
final Relation r = new Relation();
|
final Relation r = new Relation();
|
||||||
r.setSubRelType("dedupSimilarity");
|
r.setSubRelType("dedupSimilarity");
|
||||||
r.setRelClass("isSimilarTo");
|
r.setRelClass("isSimilarTo");
|
||||||
r.setDataInfo(new DataInfo());
|
r.setDataInfo(new DataInfo());
|
||||||
|
|
||||||
switch (entity) {
|
switch (entity) {
|
||||||
case "result":
|
case "result":
|
||||||
r.setSource("50|" + source);
|
r.setSource("50|" + source);
|
||||||
r.setTarget("50|" + target);
|
r.setTarget("50|" + target);
|
||||||
r.setRelType("resultResult");
|
r.setRelType("resultResult");
|
||||||
break;
|
break;
|
||||||
case "organization":
|
case "organization":
|
||||||
r.setSource("20|" + source);
|
r.setSource("20|" + source);
|
||||||
r.setTarget("20|" + target);
|
r.setTarget("20|" + target);
|
||||||
r.setRelType("organizationOrganization");
|
r.setRelType("organizationOrganization");
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
throw new IllegalArgumentException("unmanaged entity type: " + entity);
|
||||||
}
|
}
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
private void savePostgresRelation(Dataset<Relation> newRelations, String workingPath, String actionSetId, String entityType) {
|
private void savePostgresRelation(Dataset<Relation> newRelations, String workingPath, String actionSetId,
|
||||||
newRelations
|
String entityType) {
|
||||||
.write()
|
newRelations
|
||||||
.mode(SaveMode.Append)
|
.write()
|
||||||
.parquet(DedupUtility.createSimRelPath(workingPath, actionSetId, entityType));
|
.mode(SaveMode.Append)
|
||||||
}
|
.parquet(DedupUtility.createSimRelPath(workingPath, actionSetId, entityType));
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -104,13 +104,13 @@ public class SparkCreateMergeRels extends AbstractSparkAction {
|
||||||
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
.map(s -> MapDocumentUtil.getJPathString(dedupConf.getWf().getIdPath(), s))
|
||||||
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
|
.mapToPair((PairFunction<String, Object, String>) s -> new Tuple2<>(hash(s), s));
|
||||||
|
|
||||||
final RDD<Edge<String>> edgeRdd = spark
|
final RDD<Edge<String>> edgeRdd = spark
|
||||||
.read()
|
.read()
|
||||||
.load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
|
.load(DedupUtility.createSimRelPath(workingPath, actionSetId, subEntity))
|
||||||
.as(Encoders.bean(Relation.class))
|
.as(Encoders.bean(Relation.class))
|
||||||
.javaRDD()
|
.javaRDD()
|
||||||
.map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass()))
|
.map(it -> new Edge<>(hash(it.getSource()), hash(it.getTarget()), it.getRelClass()))
|
||||||
.rdd();
|
.rdd();
|
||||||
|
|
||||||
final Dataset<Relation> mergeRels = spark
|
final Dataset<Relation> mergeRels = spark
|
||||||
.createDataset(
|
.createDataset(
|
||||||
|
|
|
@ -100,17 +100,17 @@ public class SparkCreateSimRels extends AbstractSparkAction {
|
||||||
.repartition(numPartitions);
|
.repartition(numPartitions);
|
||||||
|
|
||||||
// create relations by comparing only elements in the same group
|
// create relations by comparing only elements in the same group
|
||||||
spark.createDataset(
|
spark
|
||||||
Deduper
|
.createDataset(
|
||||||
.computeRelations(sc, blocks, dedupConf)
|
Deduper
|
||||||
.map(t -> createSimRel(t._1(), t._2(), entity))
|
.computeRelations(sc, blocks, dedupConf)
|
||||||
.repartition(numPartitions)
|
.map(t -> createSimRel(t._1(), t._2(), entity))
|
||||||
.rdd(),
|
.repartition(numPartitions)
|
||||||
Encoders.bean(Relation.class)
|
.rdd(),
|
||||||
)
|
Encoders.bean(Relation.class))
|
||||||
.write()
|
.write()
|
||||||
.mode(SaveMode.Append)
|
.mode(SaveMode.Append)
|
||||||
.parquet(outputPath);
|
.parquet(outputPath);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,11 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import static jdk.nashorn.internal.objects.NativeDebug.map;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import java.io.IOException;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import java.util.*;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||||
import org.apache.http.client.methods.HttpGet;
|
import org.apache.http.client.methods.HttpGet;
|
||||||
|
@ -15,6 +13,7 @@ import org.apache.http.impl.client.CloseableHttpClient;
|
||||||
import org.apache.http.impl.client.HttpClients;
|
import org.apache.http.impl.client.HttpClients;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.api.java.function.MapGroupsFunction;
|
import org.apache.spark.api.java.function.MapGroupsFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -24,145 +23,172 @@ import org.apache.spark.sql.SparkSession;
|
||||||
import org.dom4j.DocumentException;
|
import org.dom4j.DocumentException;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import java.util.ArrayList;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import java.util.Iterator;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import java.util.List;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import java.util.Properties;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import eu.dnetlib.pace.config.DedupConfig;
|
||||||
|
import scala.Tuple2;
|
||||||
|
|
||||||
public class SparkPrepareOrgRels extends AbstractSparkAction {
|
public class SparkPrepareOrgRels extends AbstractSparkAction {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkCreateDedupRecord.class);
|
||||||
|
|
||||||
public static final String ROOT_TRUST = "0.8";
|
public SparkPrepareOrgRels(ArgumentApplicationParser parser, SparkSession spark) {
|
||||||
public static final String PROVENANCE_ACTION_CLASS = "sysimport:dedup";
|
super(parser, spark);
|
||||||
public static final String PROVENANCE_ACTIONS = "dnet:provenanceActions";
|
}
|
||||||
|
|
||||||
public SparkPrepareOrgRels(ArgumentApplicationParser parser, SparkSession spark) {
|
public static void main(String[] args) throws Exception {
|
||||||
super(parser, spark);
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
}
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SparkCreateSimRels.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/dedup/prepareOrgRels_parameters.json")));
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
public static void main(String[] args) throws Exception {
|
SparkConf conf = new SparkConf();
|
||||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||||
IOUtils
|
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
||||||
.toString(
|
|
||||||
SparkCreateSimRels.class
|
|
||||||
.getResourceAsStream(
|
|
||||||
"/eu/dnetlib/dhp/oa/dedup/prepareOrgRels_parameters.json")));
|
|
||||||
parser.parseArgument(args);
|
|
||||||
|
|
||||||
SparkConf conf = new SparkConf();
|
new SparkPrepareOrgRels(parser, getSparkSession(conf))
|
||||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
||||||
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
}
|
||||||
|
|
||||||
new SparkCreateDedupRecord(parser, getSparkSession(conf))
|
@Override
|
||||||
.run(ISLookupClientFactory.getLookUpService(parser.get("isLookUpUrl")));
|
public void run(ISLookUpService isLookUpService) throws IOException {
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
final String graphBasePath = parser.get("graphBasePath");
|
||||||
public void run(ISLookUpService isLookUpService) throws IOException {
|
final String isLookUpUrl = parser.get("isLookUpUrl");
|
||||||
|
final String actionSetId = parser.get("actionSetId");
|
||||||
|
final String workingPath = parser.get("workingPath");
|
||||||
|
final int numConnections = Optional
|
||||||
|
.ofNullable(parser.get("numConnections"))
|
||||||
|
.map(Integer::valueOf)
|
||||||
|
.orElse(NUM_CONNECTIONS);
|
||||||
|
|
||||||
final String graphBasePath = parser.get("graphBasePath");
|
final String apiUrl = Optional
|
||||||
final String isLookUpUrl = parser.get("isLookUpUrl");
|
.ofNullable(parser.get("apiUrl"))
|
||||||
final String actionSetId = parser.get("actionSetId");
|
.orElse("");
|
||||||
final String workingPath = parser.get("workingPath");
|
|
||||||
final String apiUrl = parser.get("apiUrl");
|
|
||||||
final String dbUrl = parser.get("dbUrl");
|
|
||||||
final String dbTable = parser.get("dbTable");
|
|
||||||
final String dbUser = parser.get("dbUser");
|
|
||||||
final String dbPwd = parser.get("dbPwd");
|
|
||||||
|
|
||||||
log.info("graphBasePath: '{}'", graphBasePath);
|
final String dbUrl = parser.get("dbUrl");
|
||||||
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
final String dbTable = parser.get("dbTable");
|
||||||
log.info("actionSetId: '{}'", actionSetId);
|
final String dbUser = parser.get("dbUser");
|
||||||
log.info("workingPath: '{}'", workingPath);
|
final String dbPwd = parser.get("dbPwd");
|
||||||
log.info("apiUrl: '{}'", apiUrl);
|
|
||||||
log.info("dbUrl: '{}'", dbUrl);
|
|
||||||
log.info("dbUser: '{}'", dbUser);
|
|
||||||
log.info("table: '{}'", dbTable);
|
|
||||||
log.info("dbPwd: '{}'", "xxx");
|
|
||||||
|
|
||||||
final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, "organization");
|
log.info("graphBasePath: '{}'", graphBasePath);
|
||||||
final String entityPath = DedupUtility.createEntityPath(graphBasePath, "organization");
|
log.info("isLookUpUrl: '{}'", isLookUpUrl);
|
||||||
|
log.info("actionSetId: '{}'", actionSetId);
|
||||||
|
log.info("workingPath: '{}'", workingPath);
|
||||||
|
log.info("numPartitions: '{}'", numConnections);
|
||||||
|
log.info("apiUrl: '{}'", apiUrl);
|
||||||
|
log.info("dbUrl: '{}'", dbUrl);
|
||||||
|
log.info("dbUser: '{}'", dbUser);
|
||||||
|
log.info("table: '{}'", dbTable);
|
||||||
|
log.info("dbPwd: '{}'", "xxx");
|
||||||
|
|
||||||
Dataset<OrgSimRel> relations = createRelations(spark, mergeRelPath, entityPath);
|
final String mergeRelPath = DedupUtility.createMergeRelPath(workingPath, actionSetId, "organization");
|
||||||
|
final String entityPath = DedupUtility.createEntityPath(graphBasePath, "organization");
|
||||||
|
|
||||||
final Properties connectionProperties = new Properties();
|
Dataset<OrgSimRel> relations = createRelations(spark, mergeRelPath, entityPath);
|
||||||
connectionProperties.put("user", dbUser);
|
|
||||||
connectionProperties.put("password", dbPwd);
|
|
||||||
|
|
||||||
relations.write().mode(SaveMode.Overwrite).jdbc(dbUrl, dbTable, connectionProperties);
|
final Properties connectionProperties = new Properties();
|
||||||
|
connectionProperties.put("user", dbUser);
|
||||||
|
connectionProperties.put("password", dbPwd);
|
||||||
|
|
||||||
if (!apiUrl.isEmpty())
|
relations
|
||||||
updateSimRels(apiUrl);
|
.repartition(numConnections)
|
||||||
|
.write()
|
||||||
|
.mode(SaveMode.Overwrite)
|
||||||
|
.jdbc(dbUrl, dbTable, connectionProperties);
|
||||||
|
|
||||||
}
|
if (!apiUrl.isEmpty())
|
||||||
|
updateSimRels(apiUrl);
|
||||||
|
|
||||||
public static Dataset<OrgSimRel> createRelations(
|
}
|
||||||
final SparkSession spark,
|
|
||||||
final String mergeRelsPath,
|
|
||||||
final String entitiesPath) {
|
|
||||||
|
|
||||||
// <id, json_entity>
|
public static Dataset<OrgSimRel> createRelations(
|
||||||
Dataset<Tuple2<String, Organization>> entities = spark
|
final SparkSession spark,
|
||||||
.read()
|
final String mergeRelsPath,
|
||||||
.textFile(entitiesPath)
|
final String entitiesPath) {
|
||||||
.map(
|
|
||||||
(MapFunction<String, Tuple2<String, Organization>>) it -> {
|
|
||||||
Organization entity = OBJECT_MAPPER.readValue(it, Organization.class);
|
|
||||||
return new Tuple2<>(entity.getId(), entity);
|
|
||||||
},
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.kryo(Organization.class)));
|
|
||||||
|
|
||||||
Dataset<Tuple2<String, String>> relations = spark.createDataset(
|
// <id, json_entity>
|
||||||
spark
|
Dataset<Tuple2<String, Organization>> entities = spark
|
||||||
.read()
|
.read()
|
||||||
.load(mergeRelsPath)
|
.textFile(entitiesPath)
|
||||||
.as(Encoders.bean(Relation.class))
|
.map(
|
||||||
.where("relClass == 'merges'")
|
(MapFunction<String, Tuple2<String, Organization>>) it -> {
|
||||||
.toJavaRDD()
|
Organization entity = OBJECT_MAPPER.readValue(it, Organization.class);
|
||||||
.mapToPair(r -> new Tuple2<>(r.getSource(), r.getTarget()))
|
return new Tuple2<>(entity.getId(), entity);
|
||||||
.groupByKey()
|
},
|
||||||
.flatMap(g -> {
|
Encoders.tuple(Encoders.STRING(), Encoders.kryo(Organization.class)));
|
||||||
List<Tuple2<String, String>> rels = new ArrayList<>();
|
|
||||||
for (String id1 : g._2()) {
|
|
||||||
for (String id2 : g._2()) {
|
|
||||||
if (!id1.equals(id2))
|
|
||||||
if (id1.contains("openorgs"))
|
|
||||||
rels.add(new Tuple2<>(id1, id2));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return rels.iterator();
|
|
||||||
}).rdd(),
|
|
||||||
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
|
||||||
|
|
||||||
return relations
|
Dataset<Tuple2<String, String>> relations = spark
|
||||||
.joinWith(entities, relations.col("_2").equalTo(entities.col("_1")), "inner")
|
.createDataset(
|
||||||
.map(
|
spark
|
||||||
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, Organization>>, OrgSimRel>)r ->
|
.read()
|
||||||
new OrgSimRel(
|
.load(mergeRelsPath)
|
||||||
r._1()._2(),
|
.as(Encoders.bean(Relation.class))
|
||||||
r._2()._2().getOriginalId().get(0),
|
.where("relClass == 'merges'")
|
||||||
r._2()._2().getLegalname().getValue(),
|
.toJavaRDD()
|
||||||
r._2()._2().getLegalshortname().getValue(),
|
.mapToPair(r -> new Tuple2<>(r.getSource(), r.getTarget()))
|
||||||
r._2()._2().getCountry().getClassid(),
|
.groupByKey()
|
||||||
r._2()._2().getWebsiteurl().getValue(),
|
.flatMap(g -> {
|
||||||
r._2()._2().getCollectedfrom().get(0).getValue()
|
List<Tuple2<String, String>> rels = new ArrayList<>();
|
||||||
),
|
for (String id1 : g._2()) {
|
||||||
Encoders.bean(OrgSimRel.class)
|
for (String id2 : g._2()) {
|
||||||
);
|
if (!id1.equals(id2))
|
||||||
|
if (id1.contains("openorgs____") && !id2.contains("openorgsmesh"))
|
||||||
|
rels.add(new Tuple2<>(id1, id2));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return rels.iterator();
|
||||||
|
})
|
||||||
|
.rdd(),
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
|
||||||
|
|
||||||
}
|
Dataset<Tuple2<String, OrgSimRel>> relations2 = relations // <openorgs, corda>
|
||||||
|
.joinWith(entities, relations.col("_2").equalTo(entities.col("_1")), "inner")
|
||||||
|
.map(
|
||||||
|
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, Organization>>, OrgSimRel>) r -> new OrgSimRel(
|
||||||
|
r._1()._1(),
|
||||||
|
r._2()._2().getOriginalId().get(0),
|
||||||
|
r._2()._2().getLegalname() != null ? r._2()._2().getLegalname().getValue() : "",
|
||||||
|
r._2()._2().getLegalshortname() != null ? r._2()._2().getLegalshortname().getValue() : "",
|
||||||
|
r._2()._2().getCountry() != null ? r._2()._2().getCountry().getClassid() : "",
|
||||||
|
r._2()._2().getWebsiteurl() != null ? r._2()._2().getWebsiteurl().getValue() : "",
|
||||||
|
r._2()._2().getCollectedfrom().get(0).getValue()),
|
||||||
|
Encoders.bean(OrgSimRel.class))
|
||||||
|
.map(
|
||||||
|
(MapFunction<OrgSimRel, Tuple2<String, OrgSimRel>>) o -> new Tuple2<>(o.getLocal_id(), o),
|
||||||
|
Encoders.tuple(Encoders.STRING(), Encoders.bean(OrgSimRel.class)));
|
||||||
|
|
||||||
private static String updateSimRels(final String apiUrl) throws IOException {
|
return relations2
|
||||||
final HttpGet req = new HttpGet(apiUrl);
|
.joinWith(entities, relations2.col("_1").equalTo(entities.col("_1")), "inner")
|
||||||
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
.map(
|
||||||
try (final CloseableHttpResponse response = client.execute(req)) {
|
(MapFunction<Tuple2<Tuple2<String, OrgSimRel>, Tuple2<String, Organization>>, OrgSimRel>) r -> {
|
||||||
return IOUtils.toString(response.getEntity().getContent());
|
OrgSimRel orgSimRel = r._1()._2();
|
||||||
}
|
orgSimRel.setLocal_id(r._2()._2().getOriginalId().get(0));
|
||||||
}
|
return orgSimRel;
|
||||||
}
|
},
|
||||||
|
Encoders.bean(OrgSimRel.class));
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String updateSimRels(final String apiUrl) throws IOException {
|
||||||
|
|
||||||
|
log.info("Updating simrels on the portal");
|
||||||
|
|
||||||
|
final HttpGet req = new HttpGet(apiUrl);
|
||||||
|
try (final CloseableHttpClient client = HttpClients.createDefault()) {
|
||||||
|
try (final CloseableHttpResponse response = client.execute(req)) {
|
||||||
|
return IOUtils.toString(response.getEntity().getContent());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
<workflow-app name="Duplicate Scan" xmlns="uri:oozie:workflow:0.5">
|
<workflow-app name="Organization Dedup" xmlns="uri:oozie:workflow:0.5">
|
||||||
<parameters>
|
<parameters>
|
||||||
<property>
|
<property>
|
||||||
<name>graphBasePath</name>
|
<name>graphBasePath</name>
|
||||||
|
@ -24,10 +24,6 @@
|
||||||
<name>cutConnectedComponent</name>
|
<name>cutConnectedComponent</name>
|
||||||
<description>max number of elements in a connected component</description>
|
<description>max number of elements in a connected component</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>apiUrl</name>
|
|
||||||
<description>the url for the APIs of the openorgs service</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>dbUrl</name>
|
<name>dbUrl</name>
|
||||||
<description>the url of the database</description>
|
<description>the url of the database</description>
|
||||||
|
@ -109,6 +105,16 @@
|
||||||
<fs>
|
<fs>
|
||||||
<delete path="${workingPath}"/>
|
<delete path="${workingPath}"/>
|
||||||
</fs>
|
</fs>
|
||||||
|
<ok to="copyRelations"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="copyRelations">
|
||||||
|
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
||||||
|
<arg>-pb</arg>
|
||||||
|
<arg>${graphBasePath}/relation</arg>
|
||||||
|
<arg>${workingPath}/${actionSetId}/organization_simrel</arg>
|
||||||
|
</distcp>
|
||||||
<ok to="CreateSimRel"/>
|
<ok to="CreateSimRel"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
@ -136,16 +142,6 @@
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
<arg>--numPartitions</arg><arg>8000</arg>
|
<arg>--numPartitions</arg><arg>8000</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="copyRelations"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="copyRelations">
|
|
||||||
<distcp xmlns="uri:oozie:distcp-action:0.2">
|
|
||||||
<arg>-pb</arg>
|
|
||||||
<arg>${graphBasePath}/relation</arg>
|
|
||||||
<arg>${workingPath}/organization_simrel</arg>
|
|
||||||
</distcp>
|
|
||||||
<ok to="CreateMergeRel"/>
|
<ok to="CreateMergeRel"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
@ -203,6 +199,7 @@
|
||||||
<arg>--dbTable</arg><arg>${dbTable}</arg>
|
<arg>--dbTable</arg><arg>${dbTable}</arg>
|
||||||
<arg>--dbUser</arg><arg>${dbUser}</arg>
|
<arg>--dbUser</arg><arg>${dbUser}</arg>
|
||||||
<arg>--dbPwd</arg><arg>${dbPwd}</arg>
|
<arg>--dbPwd</arg><arg>${dbPwd}</arg>
|
||||||
|
<arg>--numConnections</arg><arg>20</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -23,11 +23,17 @@
|
||||||
"paramDescription": "the id of the actionset (orchestrator)",
|
"paramDescription": "the id of the actionset (orchestrator)",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "nc",
|
||||||
|
"paramLongName": "numConnections",
|
||||||
|
"paramDescription": "number of connections to the postgres db (for the write operation)",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "au",
|
"paramName": "au",
|
||||||
"paramLongName": "apiUrl",
|
"paramLongName": "apiUrl",
|
||||||
"paramDescription": "the url for the APIs of the openorgs service",
|
"paramDescription": "the url for the APIs of the openorgs service",
|
||||||
"paramRequired": true
|
"paramRequired": false
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"paramName": "du",
|
"paramName": "du",
|
||||||
|
|
|
@ -138,10 +138,10 @@ public class EntityMergerTest implements Serializable {
|
||||||
public void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
|
public void publicationMergerTest3() throws InstantiationException, IllegalAccessException {
|
||||||
|
|
||||||
Publication pub_merged = DedupRecordFactory
|
Publication pub_merged = DedupRecordFactory
|
||||||
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals( "50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -149,7 +149,7 @@ public class EntityMergerTest implements Serializable {
|
||||||
public void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
public void publicationMergerTest4() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
||||||
|
|
||||||
Publication pub_merged = DedupRecordFactory
|
Publication pub_merged = DedupRecordFactory
|
||||||
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());
|
assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());
|
||||||
|
@ -160,7 +160,7 @@ public class EntityMergerTest implements Serializable {
|
||||||
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
||||||
|
|
||||||
Publication pub_merged = DedupRecordFactory
|
Publication pub_merged = DedupRecordFactory
|
||||||
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());
|
assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());
|
||||||
|
|
|
@ -1,11 +1,18 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import static java.nio.file.Files.createTempDirectory;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import static org.apache.spark.sql.functions.count;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import static org.mockito.Mockito.lenient;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -21,19 +28,16 @@ import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.nio.file.Paths;
|
|
||||||
|
|
||||||
import static java.nio.file.Files.createTempDirectory;
|
|
||||||
import static org.apache.spark.sql.functions.count;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.mockito.Mockito.lenient;
|
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
public class SparkDedupTest implements Serializable {
|
public class SparkDedupTest implements Serializable {
|
||||||
|
@ -48,7 +52,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
private static String testOutputBasePath;
|
private static String testOutputBasePath;
|
||||||
private static String testDedupGraphBasePath;
|
private static String testDedupGraphBasePath;
|
||||||
private static final String testActionSetId = "test-orchestrator";
|
private static final String testActionSetId = "test-orchestrator";
|
||||||
private static String testDedupAssertionsBasePath;
|
private static String testDedupAssertionsBasePath;
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void cleanUp() throws IOException, URISyntaxException {
|
public static void cleanUp() throws IOException, URISyntaxException {
|
||||||
|
@ -64,9 +68,9 @@ public class SparkDedupTest implements Serializable {
|
||||||
.toAbsolutePath()
|
.toAbsolutePath()
|
||||||
.toString();
|
.toString();
|
||||||
testDedupAssertionsBasePath = Paths
|
testDedupAssertionsBasePath = Paths
|
||||||
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/assertions").toURI())
|
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/assertions").toURI())
|
||||||
.toFile()
|
.toFile()
|
||||||
.getAbsolutePath();
|
.getAbsolutePath();
|
||||||
|
|
||||||
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
||||||
FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
|
FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
|
||||||
|
@ -82,7 +86,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws IOException, ISLookUpException {
|
public void setUp() throws IOException, ISLookUpException {
|
||||||
|
@ -165,98 +169,98 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
new SparkCreateSimRels(parser, spark).run(isLookUpService);
|
new SparkCreateSimRels(parser, spark).run(isLookUpService);
|
||||||
|
|
||||||
long orgs_simrel = spark
|
long orgs_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long pubs_simrel = spark
|
long pubs_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long sw_simrel = spark
|
long sw_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long ds_simrel = spark
|
long ds_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long orp_simrel = spark
|
long orp_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(3432, orgs_simrel);
|
assertEquals(3082, orgs_simrel);
|
||||||
assertEquals(7152, pubs_simrel);
|
assertEquals(7036, pubs_simrel);
|
||||||
assertEquals(344, sw_simrel);
|
assertEquals(344, sw_simrel);
|
||||||
assertEquals(458, ds_simrel);
|
assertEquals(442, ds_simrel);
|
||||||
assertEquals(6750, orp_simrel);
|
assertEquals(6750, orp_simrel);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(2)
|
@Order(2)
|
||||||
public void collectSimRelsTest() throws Exception {
|
public void collectSimRelsTest() throws Exception {
|
||||||
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
SparkCreateSimRels.class
|
SparkCollectSimRels.class
|
||||||
.getResourceAsStream(
|
.getResourceAsStream(
|
||||||
"/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json")));
|
"/eu/dnetlib/dhp/oa/dedup/collectSimRels_parameters.json")));
|
||||||
parser
|
parser
|
||||||
.parseArgument(
|
.parseArgument(
|
||||||
new String[] {
|
new String[] {
|
||||||
"-asi", testActionSetId,
|
"-asi", testActionSetId,
|
||||||
"-la", "lookupurl",
|
"-la", "lookupurl",
|
||||||
"-w", testOutputBasePath,
|
"-w", testOutputBasePath,
|
||||||
"-np", "50",
|
"-np", "50",
|
||||||
"-purl", "jdbc:postgresql://localhost:5432/dnet_dedup",
|
"-purl", "jdbc:postgresql://localhost:5432/dnet_dedup",
|
||||||
"-pusr", "postgres_url",
|
"-pusr", "postgres_user",
|
||||||
"-ppwd", ""
|
"-ppwd", ""
|
||||||
});
|
});
|
||||||
|
|
||||||
new SparkCollectSimRels(
|
new SparkCollectSimRels(
|
||||||
parser,
|
parser,
|
||||||
spark,
|
spark,
|
||||||
spark.read().load(testDedupAssertionsBasePath + "/similarity_groups"),
|
spark.read().load(testDedupAssertionsBasePath + "/similarity_groups"),
|
||||||
spark.read().load(testDedupAssertionsBasePath + "/groups")
|
spark.read().load(testDedupAssertionsBasePath + "/groups"))
|
||||||
).run(null);
|
.run(isLookUpService);
|
||||||
|
|
||||||
long orgs_simrel = spark
|
long orgs_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/organization_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long pubs_simrel = spark
|
long pubs_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long sw_simrel = spark
|
long sw_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long ds_simrel = spark
|
long ds_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/dataset_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
long orp_simrel = spark
|
long orp_simrel = spark
|
||||||
.read()
|
.read()
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_simrel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(4022, orgs_simrel);
|
assertEquals(3672, orgs_simrel);
|
||||||
assertEquals(10575, pubs_simrel);
|
assertEquals(10459, pubs_simrel);
|
||||||
assertEquals(3767, sw_simrel);
|
assertEquals(3767, sw_simrel);
|
||||||
assertEquals(3881, ds_simrel);
|
assertEquals(3865, ds_simrel);
|
||||||
assertEquals(10173, orp_simrel);
|
assertEquals(10173, orp_simrel);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(3)
|
@Order(3)
|
||||||
|
@ -402,8 +406,8 @@ public class SparkDedupTest implements Serializable {
|
||||||
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
.load(testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_mergerel")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(1276, orgs_mergerel);
|
assertEquals(1272, orgs_mergerel);
|
||||||
assertEquals(1442, pubs_mergerel);
|
assertEquals(1438, pubs_mergerel);
|
||||||
assertEquals(288, sw_mergerel);
|
assertEquals(288, sw_mergerel);
|
||||||
assertEquals(472, ds_mergerel);
|
assertEquals(472, ds_mergerel);
|
||||||
assertEquals(718, orp_mergerel);
|
assertEquals(718, orp_mergerel);
|
||||||
|
@ -449,10 +453,10 @@ public class SparkDedupTest implements Serializable {
|
||||||
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
testOutputBasePath + "/" + testActionSetId + "/otherresearchproduct_deduprecord")
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(82, orgs_deduprecord);
|
assertEquals(84, orgs_deduprecord);
|
||||||
assertEquals(66, pubs_deduprecord);
|
assertEquals(65, pubs_deduprecord);
|
||||||
assertEquals(51, sw_deduprecord);
|
assertEquals(51, sw_deduprecord);
|
||||||
assertEquals(96, ds_deduprecord);
|
assertEquals(97, ds_deduprecord);
|
||||||
assertEquals(89, orp_deduprecord);
|
assertEquals(89, orp_deduprecord);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -532,12 +536,12 @@ public class SparkDedupTest implements Serializable {
|
||||||
.distinct()
|
.distinct()
|
||||||
.count();
|
.count();
|
||||||
|
|
||||||
assertEquals(897, publications);
|
assertEquals(896, publications);
|
||||||
assertEquals(835, organizations);
|
assertEquals(837, organizations);
|
||||||
assertEquals(100, projects);
|
assertEquals(100, projects);
|
||||||
assertEquals(100, datasource);
|
assertEquals(100, datasource);
|
||||||
assertEquals(200, softwares);
|
assertEquals(200, softwares);
|
||||||
assertEquals(388, dataset);
|
assertEquals(389, dataset);
|
||||||
assertEquals(517, otherresearchproduct);
|
assertEquals(517, otherresearchproduct);
|
||||||
|
|
||||||
long deletedOrgs = jsc
|
long deletedOrgs = jsc
|
||||||
|
@ -592,7 +596,7 @@ public class SparkDedupTest implements Serializable {
|
||||||
|
|
||||||
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
|
||||||
|
|
||||||
assertEquals(4866, relations);
|
assertEquals(4858, relations);
|
||||||
|
|
||||||
// check deletedbyinference
|
// check deletedbyinference
|
||||||
final Dataset<Relation> mergeRels = spark
|
final Dataset<Relation> mergeRels = spark
|
||||||
|
@ -641,11 +645,11 @@ public class SparkDedupTest implements Serializable {
|
||||||
assertEquals(expected_unique, rel.distinct().count());
|
assertEquals(expected_unique, rel.distinct().count());
|
||||||
}
|
}
|
||||||
|
|
||||||
// @AfterAll
|
@AfterAll
|
||||||
// public static void finalCleanUp() throws IOException {
|
public static void finalCleanUp() throws IOException {
|
||||||
// FileUtils.deleteDirectory(new File(testOutputBasePath));
|
FileUtils.deleteDirectory(new File(testOutputBasePath));
|
||||||
// FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
|
FileUtils.deleteDirectory(new File(testDedupGraphBasePath));
|
||||||
// }
|
}
|
||||||
|
|
||||||
public boolean isDeletedByInference(String s) {
|
public boolean isDeletedByInference(String s) {
|
||||||
return s.contains("\"deletedbyinference\":true");
|
return s.contains("\"deletedbyinference\":true");
|
||||||
|
|
2
pom.xml
2
pom.xml
|
@ -315,7 +315,7 @@
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib</groupId>
|
<groupId>eu.dnetlib</groupId>
|
||||||
<artifactId>dnet-pace-core</artifactId>
|
<artifactId>dnet-pace-core</artifactId>
|
||||||
<version>4.0.4</version>
|
<version>4.0.5</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>eu.dnetlib</groupId>
|
<groupId>eu.dnetlib</groupId>
|
||||||
|
|
Loading…
Reference in New Issue