2020-07-22 17:29:48 +02:00
|
|
|
|
2020-10-06 16:44:51 +02:00
|
|
|
package eu.dnetlib.dhp.oa.dedup.model;
|
2020-07-22 17:29:48 +02:00
|
|
|
|
|
|
|
import java.io.Serializable;
|
2020-11-04 15:02:02 +01:00
|
|
|
import java.text.ParseException;
|
|
|
|
import java.text.SimpleDateFormat;
|
2020-11-26 11:03:12 +01:00
|
|
|
import java.util.*;
|
2020-10-06 16:21:34 +02:00
|
|
|
import java.util.stream.Collectors;
|
2020-07-22 17:29:48 +02:00
|
|
|
|
2020-11-26 13:08:36 +01:00
|
|
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
2020-11-04 15:02:02 +01:00
|
|
|
import org.apache.commons.lang3.StringUtils;
|
|
|
|
|
2020-10-20 12:19:46 +02:00
|
|
|
import com.google.common.collect.Sets;
|
2020-10-30 15:47:05 +01:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
import eu.dnetlib.dhp.oa.dedup.DatePicker;
|
2020-09-29 15:31:46 +02:00
|
|
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
2020-11-04 15:02:02 +01:00
|
|
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.*;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
|
|
|
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
2020-09-29 15:31:46 +02:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier> {
|
2020-09-29 15:31:46 +02:00
|
|
|
|
2020-11-26 11:03:12 +01:00
|
|
|
public static final String DATE_FORMAT = "yyyy-MM-dd";
|
2020-11-26 13:08:36 +01:00
|
|
|
public static final String BASE_DATE = "2000-01-01";
|
2020-09-29 15:31:46 +02:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
private T entity;
|
2020-09-29 15:31:46 +02:00
|
|
|
|
2020-11-26 11:03:12 +01:00
|
|
|
// cached date value
|
|
|
|
private Date date = null;
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public static <T extends OafEntity> Identifier newInstance(T entity) {
|
|
|
|
return new Identifier(entity);
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public Identifier(T entity) {
|
|
|
|
this.entity = entity;
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public T getEntity() {
|
|
|
|
return entity;
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public void setEntity(T entity) {
|
|
|
|
this.entity = entity;
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public Date getDate() {
|
2020-11-26 11:03:12 +01:00
|
|
|
if (Objects.nonNull(date)) {
|
|
|
|
return date;
|
|
|
|
} else {
|
|
|
|
String sDate = BASE_DATE;
|
|
|
|
if (ModelSupport.isSubClass(getEntity(), Result.class)) {
|
|
|
|
Result result = (Result) getEntity();
|
|
|
|
if (isWellformed(result.getDateofacceptance())) {
|
|
|
|
sDate = result.getDateofacceptance().getValue();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
try {
|
|
|
|
this.date = new SimpleDateFormat(DATE_FORMAT).parse(sDate);
|
|
|
|
return date;
|
|
|
|
} catch (Throwable e) {
|
|
|
|
throw new RuntimeException(
|
|
|
|
String.format("cannot parse date: '%s' from record: '%s'", sDate, entity.getId()));
|
2020-11-04 15:02:02 +01:00
|
|
|
}
|
|
|
|
}
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
private static boolean isWellformed(Field<String> date) {
|
|
|
|
return date != null && StringUtils.isNotBlank(date.getValue())
|
|
|
|
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public List<KeyValue> getCollectedFrom() {
|
|
|
|
return entity.getCollectedfrom();
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public EntityType getEntityType() {
|
|
|
|
return EntityType.fromClass(entity.getClass());
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
public String getOriginalID() {
|
|
|
|
return entity.getId();
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
private PidType getPidType() {
|
|
|
|
return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
@Override
|
|
|
|
public int compareTo(Identifier i) {
|
|
|
|
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
|
|
|
// alphabetical order of the originalID
|
2020-10-06 16:21:34 +02:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
Set<String> lKeys = Optional
|
|
|
|
.ofNullable(getCollectedFrom())
|
|
|
|
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
|
|
|
.orElse(Sets.newHashSet());
|
2020-10-20 12:19:46 +02:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
|
|
|
|
Set<String> rKeys = cf
|
|
|
|
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
|
|
|
.orElse(Sets.newHashSet());
|
2020-10-06 16:21:34 +02:00
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
|
|
|
|
if (getEntityType() == EntityType.publication) {
|
2020-11-26 13:08:36 +01:00
|
|
|
if (isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID)
|
|
|
|
&& !isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID))
|
2020-09-29 15:31:46 +02:00
|
|
|
return -1;
|
2020-11-26 13:08:36 +01:00
|
|
|
if (isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID)
|
|
|
|
&& !isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID))
|
2020-09-29 15:31:46 +02:00
|
|
|
return 1;
|
2020-11-04 15:02:02 +01:00
|
|
|
}
|
|
|
|
if (getEntityType() == EntityType.dataset) {
|
2020-11-26 13:08:36 +01:00
|
|
|
if (isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID)
|
|
|
|
&& !isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID))
|
2020-09-29 15:31:46 +02:00
|
|
|
return -1;
|
2020-11-26 13:08:36 +01:00
|
|
|
if (isFromDatasourceID(rKeys, ModelConstants.CROSSREF_ID)
|
|
|
|
&& !isFromDatasourceID(lKeys, ModelConstants.CROSSREF_ID))
|
2020-11-04 15:02:02 +01:00
|
|
|
return 1;
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
2020-10-06 16:21:34 +02:00
|
|
|
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
2020-11-26 11:03:12 +01:00
|
|
|
// we need to take the alphabetically lower id
|
2020-11-04 15:02:02 +01:00
|
|
|
return this.getOriginalID().compareTo(i.getOriginalID());
|
2020-09-29 15:31:46 +02:00
|
|
|
} else
|
2020-11-26 11:03:12 +01:00
|
|
|
// we need to take the elder date
|
2020-11-04 15:02:02 +01:00
|
|
|
return this.getDate().compareTo(i.getDate());
|
2020-09-29 15:31:46 +02:00
|
|
|
} else {
|
2020-11-04 15:02:02 +01:00
|
|
|
return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2020-11-04 15:02:02 +01:00
|
|
|
private StructuredProperty toSP(PidType pidType) {
|
|
|
|
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
|
|
|
|
}
|
|
|
|
|
2020-10-06 16:21:34 +02:00
|
|
|
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
|
|
|
return collectedFrom.contains(dsId);
|
2020-09-29 15:31:46 +02:00
|
|
|
}
|
2020-07-22 17:29:48 +02:00
|
|
|
}
|