2020-04-27 14:52:31 +02:00
|
|
|
|
2020-03-27 10:42:17 +01:00
|
|
|
package eu.dnetlib.dhp.oa.dedup.graph;
|
2019-12-06 13:38:00 +01:00
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.Serializable;
|
2021-04-14 18:06:07 +02:00
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Objects;
|
2019-12-06 13:38:00 +01:00
|
|
|
import java.util.Set;
|
2020-07-13 14:18:42 +02:00
|
|
|
import java.util.stream.Collectors;
|
2020-04-28 11:23:29 +02:00
|
|
|
|
2021-04-15 10:59:24 +02:00
|
|
|
import org.apache.commons.lang.StringUtils;
|
|
|
|
import org.apache.spark.api.java.function.MapFunction;
|
|
|
|
import org.codehaus.jackson.annotate.JsonIgnore;
|
|
|
|
|
2021-04-14 18:06:07 +02:00
|
|
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
2021-04-15 10:59:24 +02:00
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
2021-04-14 18:06:07 +02:00
|
|
|
import com.google.common.collect.Lists;
|
2021-04-15 10:59:24 +02:00
|
|
|
|
|
|
|
import eu.dnetlib.dhp.oa.dedup.DedupUtility;
|
2021-04-14 18:06:07 +02:00
|
|
|
import eu.dnetlib.dhp.oa.dedup.IdGenerator;
|
|
|
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
|
|
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
|
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
2021-04-15 10:59:24 +02:00
|
|
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
2021-04-14 18:06:07 +02:00
|
|
|
import eu.dnetlib.pace.config.DedupConfig;
|
|
|
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
2020-04-28 11:23:29 +02:00
|
|
|
import eu.dnetlib.pace.util.PaceException;
|
|
|
|
|
2019-12-06 13:38:00 +01:00
|
|
|
public class ConnectedComponent implements Serializable {
|
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
private String ccId;
|
2021-04-15 10:59:24 +02:00
|
|
|
private Set<String> ids;
|
|
|
|
|
|
|
|
private static final String CONNECTED_COMPONENT_ID_PREFIX = "connect_comp";
|
|
|
|
|
|
|
|
public ConnectedComponent(Set<String> ids, final int cut) {
|
|
|
|
this.ids = ids;
|
|
|
|
|
|
|
|
this.ccId = createDefaultID();
|
|
|
|
|
|
|
|
if (cut > 0 && ids.size() > cut) {
|
|
|
|
this.ids = ids
|
|
|
|
.stream()
|
|
|
|
.filter(id -> !ccId.equalsIgnoreCase(id))
|
|
|
|
.limit(cut - 1)
|
|
|
|
.collect(Collectors.toSet());
|
|
|
|
// this.ids.add(ccId); ??
|
2020-07-13 14:18:42 +02:00
|
|
|
}
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2021-04-15 10:59:24 +02:00
|
|
|
public ConnectedComponent(String ccId, Set<String> ids) {
|
|
|
|
this.ccId = ccId;
|
|
|
|
this.ids = ids;
|
|
|
|
}
|
|
|
|
|
2021-04-14 18:06:07 +02:00
|
|
|
public String createDefaultID() {
|
2021-04-15 10:59:24 +02:00
|
|
|
if (ids.size() > 1) {
|
2020-04-27 14:52:31 +02:00
|
|
|
final String s = getMin();
|
|
|
|
String prefix = s.split("\\|")[0];
|
2021-04-15 10:59:24 +02:00
|
|
|
ccId = prefix + "|" + CONNECTED_COMPONENT_ID_PREFIX + "::" + DHPUtils.md5(s);
|
2020-04-27 14:52:31 +02:00
|
|
|
return ccId;
|
|
|
|
} else {
|
2021-04-15 10:59:24 +02:00
|
|
|
return ids.iterator().next();
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
@JsonIgnore
|
|
|
|
public String getMin() {
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
final StringBuilder min = new StringBuilder();
|
2020-07-13 14:18:42 +02:00
|
|
|
|
2021-04-15 10:59:24 +02:00
|
|
|
ids
|
2020-04-27 14:52:31 +02:00
|
|
|
.forEach(
|
2021-04-15 10:59:24 +02:00
|
|
|
id -> {
|
2020-04-27 14:52:31 +02:00
|
|
|
if (StringUtils.isBlank(min.toString())) {
|
2021-04-15 10:59:24 +02:00
|
|
|
min.append(id);
|
2020-04-27 14:52:31 +02:00
|
|
|
} else {
|
2021-04-15 10:59:24 +02:00
|
|
|
if (min.toString().compareTo(id) > 0) {
|
2020-04-27 14:52:31 +02:00
|
|
|
min.setLength(0);
|
2021-04-15 10:59:24 +02:00
|
|
|
min.append(id);
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
|
|
|
return min.toString();
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
@Override
|
|
|
|
public String toString() {
|
|
|
|
ObjectMapper mapper = new ObjectMapper();
|
|
|
|
try {
|
|
|
|
return mapper.writeValueAsString(this);
|
|
|
|
} catch (IOException e) {
|
|
|
|
throw new PaceException("Failed to create Json: ", e);
|
|
|
|
}
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2021-04-15 10:59:24 +02:00
|
|
|
public Set<String> getIds() {
|
|
|
|
return ids;
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2021-04-15 10:59:24 +02:00
|
|
|
public void setIds(Set<String> ids) {
|
|
|
|
this.ids = ids;
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
public String getCcId() {
|
|
|
|
return ccId;
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
public void setCcId(String ccId) {
|
|
|
|
this.ccId = ccId;
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
}
|