Compare commits

...

1 Commits

Author SHA1 Message Date
Claudio Atzori 1135f19ec6 WIP: pid cleaning 2023-06-06 12:12:23 +02:00
9 changed files with 93 additions and 30 deletions

View File

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId> <groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId> <artifactId>dhp-schemas</artifactId>
<packaging>jar</packaging> <packaging>jar</packaging>
<version>3.16.1-SNAPSHOT</version> <version>3.17.1-SNAPSHOT</version>
<licenses> <licenses>
<license> <license>

View File

@ -21,7 +21,7 @@ public class Instance implements Serializable {
private KeyValue collectedfrom; private KeyValue collectedfrom;
private List<StructuredProperty> pid; private List<Pid> pid;
private List<StructuredProperty> alternateIdentifier; private List<StructuredProperty> alternateIdentifier;
@ -95,11 +95,11 @@ public class Instance implements Serializable {
this.collectedfrom = collectedfrom; this.collectedfrom = collectedfrom;
} }
public List<StructuredProperty> getPid() { public List<Pid> getPid() {
return pid; return pid;
} }
public void setPid(List<StructuredProperty> pid) { public void setPid(List<Pid> pid) {
this.pid = pid; this.pid = pid;
} }

View File

@ -11,7 +11,7 @@ public abstract class OafEntity extends Oaf implements Serializable {
private List<String> originalId; private List<String> originalId;
private List<StructuredProperty> pid; private List<Pid> pid;
private String dateofcollection; private String dateofcollection;
@ -60,11 +60,11 @@ public abstract class OafEntity extends Oaf implements Serializable {
this.originalId = originalId; this.originalId = originalId;
} }
public List<StructuredProperty> getPid() { public List<Pid> getPid() {
return pid; return pid;
} }
public void setPid(List<StructuredProperty> pid) { public void setPid(List<Pid> pid) {
this.pid = pid; this.pid = pid;
} }

View File

@ -0,0 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
/**
 * Dedicated type for persistent identifiers.
 *
 * The class declares no fields or methods of its own: everything it exposes is
 * inherited from {@link StructuredProperty}. It exists so that identifier lists can be
 * typed as {@code List<Pid>} rather than as generic structured properties.
 */
public class Pid extends StructuredProperty implements Serializable {
}

View File

@ -570,7 +570,21 @@ public class Result extends OafEntity implements Serializable {
private static String extractKeyFromPid(final StructuredProperty pid) { private static String extractKeyFromPid(final StructuredProperty pid) {
if (pid == null) if (pid == null)
return null; return null;
final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid); final Pid normalizedPid = CleaningFunctions.normalizePidValue((Pid) pid);
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
}
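/**
* Builds the lookup key of an alternate identifier, in the form
* {@code <qualifier classid>::<normalized value>}.
*
* @param pid the alternate identifier, may be null
* @return the lookup key, or null when the given identifier is null
*/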
private static String extractKeyFromAltId(final StructuredProperty pid) {
if (pid == null)
return null;
final StructuredProperty normalizedPid = CleaningFunctions.normalizeSPValue(pid);
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue()); return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
} }
@ -595,8 +609,6 @@ public class Result extends OafEntity implements Serializable {
* @return the result map * @return the result map
*/ */
public static Map<String, Instance> toInstanceMap(final List<Instance> ri) { public static Map<String, Instance> toInstanceMap(final List<Instance> ri) {
return ri return ri
.stream() .stream()
.filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null) .filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null)
@ -622,7 +634,7 @@ public class Result extends OafEntity implements Serializable {
* @param enrichments the List of enrichment instances having the same pid * @param enrichments the List of enrichment instances having the same pid
* @return the list * @return the list
*/ */
private static List<Instance> findEnrichmentsByPID(final List<StructuredProperty> pids, final Map<String,Instance> enrichments) { private static List<Instance> findEnrichmentsByPID(final List<Pid> pids, final Map<String,Instance> enrichments) {
if (pids == null || enrichments == null) if (pids == null || enrichments == null)
return null; return null;
return pids return pids
@ -633,6 +645,25 @@ public class Result extends OafEntity implements Serializable {
.collect(Collectors.toList()); .collect(Collectors.toList());
} }
/**
 * Looks up the enrichment instances that correspond to the given alternate identifiers.
 *
 * Each identifier is converted to its lookup key with {@code extractKeyFromAltId} and the
 * key is resolved against the enrichment map; identifiers without a matching entry are
 * skipped. No de-duplication is performed, so an instance reachable through several
 * identifiers appears once per matching identifier.
 *
 * @param pids the alternate identifiers of the instance to enrich
 * @param enrichments the enrichment instances indexed by identifier key
 * @return the matching enrichment instances (possibly empty), or null when either argument is null
 */
private static List<Instance> findEnrichmentsByAlternateIdentifier(final List<StructuredProperty> pids, final Map<String,Instance> enrichments) {
    if (pids == null || enrichments == null) {
        return null;
    }
    return pids
        .stream()
        // resolve each identifier straight to its enrichment (null when there is none)
        .map(altId -> enrichments.get(extractKeyFromAltId(altId)))
        .filter(match -> match != null)
        .collect(Collectors.toList());
}
/** /**
* This method apply enrichment on a single instance * This method apply enrichment on a single instance
* The enrichment consists of replacing values on * The enrichment consists of replacing values on
@ -712,11 +743,11 @@ public class Result extends OafEntity implements Serializable {
toEnrichInstances.forEach(i -> { toEnrichInstances.forEach(i -> {
final List<Instance> e = findEnrichmentsByPID(i.getPid(), ri); final List<Instance> e = findEnrichmentsByPID(i.getPid(), ri);
if (e!= null && e.size()> 0) { if (e!= null && !e.isEmpty()) {
e.forEach(enr -> applyEnrichment(i, enr)); e.forEach(enr -> applyEnrichment(i, enr));
} else { } else {
final List<Instance> a = findEnrichmentsByPID(i.getAlternateIdentifier(), ri); final List<Instance> a = findEnrichmentsByAlternateIdentifier(i.getAlternateIdentifier(), ri);
if (a!= null && a.size()> 0) { if (a!= null && !a.isEmpty()) {
a.forEach(enr -> applyEnrichment(i, enr)); a.forEach(enr -> applyEnrichment(i, enr));
} }
} }

View File

@ -8,6 +8,7 @@ import java.util.Set;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.Pid;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleaningFunctions { public class CleaningFunctions {
@ -29,7 +30,7 @@ public class CleaningFunctions {
* @param s the PID whose value will be checked. * @param s the PID whose value will be checked.
* @return false if the pid matches the filter criteria, true otherwise. * @return false if the pid matches the filter criteria, true otherwise.
*/ */
public static boolean pidFilter(StructuredProperty s) { public static boolean pidFilter(Pid s) {
final String pidValue = s.getValue(); final String pidValue = s.getValue();
if (Objects.isNull(s.getQualifier()) || if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(pidValue) || StringUtils.isBlank(pidValue) ||
@ -47,7 +48,7 @@ public class CleaningFunctions {
* @param pid the PID whose value will be normalised. * @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value. * @return the PID containing the normalised value.
*/ */
public static StructuredProperty normalizePidValue(StructuredProperty pid) { public static Pid normalizePidValue(Pid pid) {
pid.setValue( pid.setValue(
normalizePidValue( normalizePidValue(
pid.getQualifier().getClassid(), pid.getQualifier().getClassid(),
@ -56,6 +57,20 @@ public class CleaningFunctions {
return pid; return pid;
} }
/**
 * Normalises the value of a structured property according to the type found in its qualifier.
 *
 * The property is modified in place: its value is replaced with the result of
 * {@code normalizePidValue(classid, value)} and the very same instance is returned.
 *
 * @param pid the structured property whose value will be normalised; it must be non-null
 *            and carry a non-null qualifier, as both are dereferenced without checks
 * @return the given property, now holding the normalised value
 */
public static StructuredProperty normalizeSPValue(StructuredProperty pid) {
    final String type = pid.getQualifier().getClassid();
    final String normalized = normalizePidValue(type, pid.getValue());
    pid.setValue(normalized);
    return pid;
}
public static String normalizePidValue(String pidType, String pidValue) { public static String normalizePidValue(String pidType, String pidValue) {
String value = Optional String value = Optional
.ofNullable(pidValue) .ofNullable(pidValue)

View File

@ -95,7 +95,7 @@ public class IdentifierFactory implements Serializable {
.collect(Collectors.toCollection(HashSet::new)); .collect(Collectors.toCollection(HashSet::new));
} }
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) { public static List<StructuredProperty> getPids(List<Pid> pid, KeyValue collectedFrom) {
return pidFromInstance(pid, collectedFrom, true).distinct().collect(Collectors.toList()); return pidFromInstance(pid, collectedFrom, true).distinct().collect(Collectors.toList());
} }
@ -143,7 +143,7 @@ public class IdentifierFactory implements Serializable {
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier"); checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
final Map<String, Set<StructuredProperty>> pids = extractPids(entity); final Map<String, Set<Pid>> pids = extractPids(entity);
return pids return pids
.values() .values()
@ -164,7 +164,7 @@ public class IdentifierFactory implements Serializable {
.orElseGet(entity::getId); .orElseGet(entity::getId);
} }
private static <T extends OafEntity> Map<String, Set<StructuredProperty>> extractPids(T entity) { private static <T extends OafEntity> Map<String, Set<Pid>> extractPids(T entity) {
if (entity instanceof Result) { if (entity instanceof Result) {
return Optional return Optional
.ofNullable(((Result) entity).getInstance()) .ofNullable(((Result) entity).getInstance())
@ -184,7 +184,7 @@ public class IdentifierFactory implements Serializable {
} }
} }
private static Map<String, Set<StructuredProperty>> mapPids(List<Instance> instance) { private static Map<String, Set<Pid>> mapPids(List<Instance> instance) {
return instance return instance
.stream() .stream()
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false)) .map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false))
@ -196,7 +196,7 @@ public class IdentifierFactory implements Serializable {
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new)))); Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
} }
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom, private static Stream<Pid> pidFromInstance(List<Pid> pid, KeyValue collectedFrom,
boolean mapHandles) { boolean mapHandles) {
return Optional return Optional
.ofNullable(pid) .ofNullable(pid)

View File

@ -4,12 +4,13 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator; import java.util.Comparator;
import java.util.Optional; import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.Pid;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidValueComparator implements Comparator<StructuredProperty> { public class PidValueComparator implements Comparator<Pid> {
@Override @Override
public int compare(StructuredProperty left, StructuredProperty right) { public int compare(Pid left, Pid right) {
if (left == null && right == null) if (left == null && right == null)
return 0; return 0;
@ -18,15 +19,15 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
if (right == null) if (right == null)
return -1; return -1;
StructuredProperty l = CleaningFunctions.normalizePidValue(left); Pid l = CleaningFunctions.normalizePidValue(left);
StructuredProperty r = CleaningFunctions.normalizePidValue(right); Pid r = CleaningFunctions.normalizePidValue(right);
return Optional return Optional
.ofNullable(l.getValue()) .ofNullable(l.getValue())
.map( .map(
lv -> Optional lv -> Optional
.ofNullable(r.getValue()) .ofNullable(r.getValue())
.map(rv -> lv.compareTo(rv)) .map(lv::compareTo)
.orElse(-1)) .orElse(-1))
.orElse(1); .orElse(1);
} }

View File

@ -109,10 +109,20 @@ class MergeTest {
final Result currentPub = source.get(i); final Result currentPub = source.get(i);
final Result currentEnrichment = enrichment.get(i); final Result currentEnrichment = enrichment.get(i);
final Instance currentInstance = Objects.requireNonNull(currentPub.getInstance()).get(0); final Instance currentInstance = Objects.requireNonNull(currentPub.getInstance()).get(0);
if (overrideAlternateIdentifier) final List<Pid> pid = Objects.requireNonNull(currentEnrichment.getInstance()).get(0).getPid();
currentInstance.setAlternateIdentifier(Objects.requireNonNull(currentEnrichment.getInstance()).get(0).getPid()); if (overrideAlternateIdentifier) {
else currentInstance.setAlternateIdentifier(pid.stream()
currentInstance.setPid(Objects.requireNonNull(currentEnrichment.getInstance()).get(0).getPid()); .map(p -> {
StructuredProperty sp = new StructuredProperty();
sp.setValue(p.getValue());
sp.setQualifier(p.getQualifier());
sp.setDataInfo(p.getDataInfo());
return sp;
})
.collect(Collectors.toList()));
} else {
currentInstance.setPid(pid);
}
} }
} }