2020-04-27 14:52:31 +02:00
|
|
|
|
2020-03-27 10:42:17 +01:00
|
|
|
package eu.dnetlib.dhp.oa.dedup.graph;
|
2019-12-06 13:38:00 +01:00
|
|
|
|
|
|
|
import java.io.IOException;
|
|
|
|
import java.io.Serializable;
|
2021-04-14 18:06:07 +02:00
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Objects;
|
2019-12-06 13:38:00 +01:00
|
|
|
import java.util.Set;
|
2020-07-13 14:18:42 +02:00
|
|
|
import java.util.stream.Collectors;
|
2020-04-28 11:23:29 +02:00
|
|
|
|
2021-04-14 18:06:07 +02:00
|
|
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
|
|
|
import com.google.common.collect.Lists;
|
|
|
|
import eu.dnetlib.dhp.oa.dedup.IdGenerator;
|
|
|
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
|
|
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
|
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
|
|
|
import eu.dnetlib.pace.config.DedupConfig;
|
|
|
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
2020-04-18 12:42:58 +02:00
|
|
|
import org.apache.commons.lang.StringUtils;
|
2021-04-14 18:06:07 +02:00
|
|
|
import org.apache.spark.api.java.function.MapFunction;
|
2020-04-18 12:42:58 +02:00
|
|
|
import org.codehaus.jackson.annotate.JsonIgnore;
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-28 11:23:29 +02:00
|
|
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
|
|
|
|
|
|
import eu.dnetlib.dhp.oa.dedup.DedupUtility;
|
2020-10-20 12:19:46 +02:00
|
|
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
2020-04-28 11:23:29 +02:00
|
|
|
import eu.dnetlib.pace.util.PaceException;
|
|
|
|
|
2019-12-06 13:38:00 +01:00
|
|
|
public class ConnectedComponent implements Serializable {
|
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
private String ccId;
|
2021-04-14 18:06:07 +02:00
|
|
|
private Set<String> entities;
|
|
|
|
|
|
|
|
protected static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
|
|
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
|
|
|
|
|
|
public <T extends OafEntity> ConnectedComponent(Set<String> entities, String subEntity, final int cut) {
|
|
|
|
this.entities = entities;
|
|
|
|
final Class<T> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));
|
|
|
|
|
|
|
|
List<Identifier<T>> identifiers = Lists.newArrayList();
|
|
|
|
|
|
|
|
entities.forEach(e -> {
|
|
|
|
try {
|
|
|
|
T entity = OBJECT_MAPPER.readValue(e, clazz);
|
|
|
|
identifiers.add(Identifier.newInstance(entity));
|
|
|
|
} catch (IOException e1) {
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
this.ccId = IdGenerator.generate(
|
|
|
|
identifiers,
|
|
|
|
createDefaultID()
|
|
|
|
);
|
|
|
|
|
|
|
|
if (cut > 0 && entities.size() > cut) {
|
|
|
|
this.entities = entities
|
|
|
|
.stream()
|
|
|
|
.filter(e -> !ccId.equalsIgnoreCase(MapDocumentUtil.getJPathString("$.id", e)))
|
|
|
|
.limit(cut - 1)
|
|
|
|
.collect(Collectors.toSet());
|
2020-07-13 14:18:42 +02:00
|
|
|
}
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2021-04-14 18:06:07 +02:00
|
|
|
public String createDefaultID() {
|
|
|
|
if (entities.size() > 1) {
|
2020-04-27 14:52:31 +02:00
|
|
|
final String s = getMin();
|
|
|
|
String prefix = s.split("\\|")[0];
|
2020-10-06 16:21:34 +02:00
|
|
|
ccId = prefix + "|dedup_wf_001::" + DHPUtils.md5(s);
|
2020-04-27 14:52:31 +02:00
|
|
|
return ccId;
|
|
|
|
} else {
|
2021-04-14 18:06:07 +02:00
|
|
|
return MapDocumentUtil.getJPathString("$.id", entities.iterator().next());
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
@JsonIgnore
|
|
|
|
public String getMin() {
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
final StringBuilder min = new StringBuilder();
|
2020-07-13 14:18:42 +02:00
|
|
|
|
2021-04-14 18:06:07 +02:00
|
|
|
entities
|
2020-04-27 14:52:31 +02:00
|
|
|
.forEach(
|
2021-04-14 18:06:07 +02:00
|
|
|
e -> {
|
2020-04-27 14:52:31 +02:00
|
|
|
if (StringUtils.isBlank(min.toString())) {
|
2021-04-14 18:06:07 +02:00
|
|
|
min.append(MapDocumentUtil.getJPathString("$.id", e));
|
2020-04-27 14:52:31 +02:00
|
|
|
} else {
|
2021-04-14 18:06:07 +02:00
|
|
|
if (min.toString().compareTo(MapDocumentUtil.getJPathString("$.id", e)) > 0) {
|
2020-04-27 14:52:31 +02:00
|
|
|
min.setLength(0);
|
2021-04-14 18:06:07 +02:00
|
|
|
min.append(MapDocumentUtil.getJPathString("$.id", e));
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
});
|
|
|
|
return min.toString();
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
@Override
|
|
|
|
public String toString() {
|
|
|
|
ObjectMapper mapper = new ObjectMapper();
|
|
|
|
try {
|
|
|
|
return mapper.writeValueAsString(this);
|
|
|
|
} catch (IOException e) {
|
|
|
|
throw new PaceException("Failed to create Json: ", e);
|
|
|
|
}
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2021-04-14 18:06:07 +02:00
|
|
|
public Set<String> getEntities() {
|
|
|
|
return entities;
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2021-04-14 18:06:07 +02:00
|
|
|
public void setEntities(Set<String> docIds) {
|
|
|
|
this.entities = entities;
|
2020-04-27 14:52:31 +02:00
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
public String getCcId() {
|
|
|
|
return ccId;
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
|
2020-04-27 14:52:31 +02:00
|
|
|
public void setCcId(String ccId) {
|
|
|
|
this.ccId = ccId;
|
|
|
|
}
|
2019-12-06 13:38:00 +01:00
|
|
|
}
|