Compare commits

...

1 Commits

Author SHA1 Message Date
Claudio Atzori 1135f19ec6 WIP: pid cleaning 11 months ago

@ -5,7 +5,7 @@
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<packaging>jar</packaging>
<version>3.16.1-SNAPSHOT</version>
<version>3.17.1-SNAPSHOT</version>
<licenses>
<license>

@ -21,7 +21,7 @@ public class Instance implements Serializable {
private KeyValue collectedfrom;
private List<StructuredProperty> pid;
private List<Pid> pid;
private List<StructuredProperty> alternateIdentifier;
@ -95,11 +95,11 @@ public class Instance implements Serializable {
this.collectedfrom = collectedfrom;
}
public List<StructuredProperty> getPid() {
public List<Pid> getPid() {
return pid;
}
public void setPid(List<StructuredProperty> pid) {
public void setPid(List<Pid> pid) {
this.pid = pid;
}

@ -11,7 +11,7 @@ public abstract class OafEntity extends Oaf implements Serializable {
private List<String> originalId;
private List<StructuredProperty> pid;
private List<Pid> pid;
private String dateofcollection;
@ -60,11 +60,11 @@ public abstract class OafEntity extends Oaf implements Serializable {
this.originalId = originalId;
}
public List<StructuredProperty> getPid() {
public List<Pid> getPid() {
return pid;
}
public void setPid(List<StructuredProperty> pid) {
public void setPid(List<Pid> pid) {
this.pid = pid;
}

@ -0,0 +1,6 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
/**
 * Persistent identifier (PID) of an entity or of one of its instances.
 *
 * <p>The class adds no fields or behaviour to {@link StructuredProperty}. It exists as a distinct
 * subtype so that PID lists can be declared as {@code List<Pid>}, while alternate identifiers
 * remain {@code List<StructuredProperty>}.
 */
public class Pid extends StructuredProperty implements Serializable {
}

@ -570,7 +570,21 @@ public class Result extends OafEntity implements Serializable {
private static String extractKeyFromPid(final StructuredProperty pid) {
// A null pid produces a null key instead of an exception.
if (pid == null)
return null;
final StructuredProperty normalizedPid = CleaningFunctions.normalizePidValue(pid);
// NOTE(review): the parameter is declared as StructuredProperty but is downcast to Pid here.
// A caller passing a plain StructuredProperty (e.g. an alternate identifier) would fail with a
// ClassCastException - confirm every caller passes a Pid, or narrow the parameter type.
final Pid normalizedPid = CleaningFunctions.normalizePidValue((Pid) pid);
// The key has the form "<qualifier classid>::<value>".
return String.format("%s::%s", normalizedPid.getQualifier().getClassid(), normalizedPid.getValue());
}
/**
 * Builds the lookup key of an alternate identifier.
 *
 * <p>The identifier is first passed through
 * {@link CleaningFunctions#normalizeSPValue(StructuredProperty)}; the key is then composed from the
 * class id of its qualifier and its value.
 *
 * @param pid the alternate identifier, may be {@code null}
 * @return the key in the form {@code classid::value}, or {@code null} when {@code pid} is {@code null}
 */
private static String extractKeyFromAltId(final StructuredProperty pid) {
    if (pid != null) {
        final StructuredProperty cleaned = CleaningFunctions.normalizeSPValue(pid);
        final String type = cleaned.getQualifier().getClassid();
        return String.format("%s::%s", type, cleaned.getValue());
    }
    return null;
}
@ -595,8 +609,6 @@ public class Result extends OafEntity implements Serializable {
* @return the result map
*/
public static Map<String, Instance> toInstanceMap(final List<Instance> ri) {
return ri
.stream()
.filter(i -> i.getPid() != null || i.getAlternateIdentifier() != null)
@ -622,7 +634,7 @@ public class Result extends OafEntity implements Serializable {
* @param enrichments the List of enrichment instances having the same pid
* @return the list
*/
private static List<Instance> findEnrichmentsByPID(final List<StructuredProperty> pids, final Map<String,Instance> enrichments) {
private static List<Instance> findEnrichmentsByPID(final List<Pid> pids, final Map<String,Instance> enrichments) {
if (pids == null || enrichments == null)
return null;
return pids
@ -633,6 +645,25 @@ public class Result extends OafEntity implements Serializable {
.collect(Collectors.toList());
}
/**
 * Finds the enrichment instances that match one or more of the given alternate identifiers.
 *
 * <p>Each identifier is converted to a key with {@link #extractKeyFromAltId(StructuredProperty)}
 * and looked up in the enrichment map. Identifiers without a match are dropped.
 *
 * @param pids the alternate identifiers to look up
 * @param enrichments the enrichment instances, addressed by identifier key
 * @return the matching enrichment instances, or {@code null} when either argument is {@code null}
 */
private static List<Instance> findEnrichmentsByAlternateIdentifier(final List<StructuredProperty> pids, final Map<String,Instance> enrichments) {
    if (pids != null && enrichments != null) {
        return pids
            .stream()
            .map(altId -> enrichments.get(extractKeyFromAltId(altId)))
            .filter(Objects::nonNull)
            .collect(Collectors.toList());
    }
    return null;
}
/**
* This method apply enrichment on a single instance
* The enrichment consists of replacing values on
@ -712,11 +743,11 @@ public class Result extends OafEntity implements Serializable {
toEnrichInstances.forEach(i -> {
final List<Instance> e = findEnrichmentsByPID(i.getPid(), ri);
if (e!= null && e.size()> 0) {
if (e!= null && !e.isEmpty()) {
e.forEach(enr -> applyEnrichment(i, enr));
} else {
final List<Instance> a = findEnrichmentsByPID(i.getAlternateIdentifier(), ri);
if (a!= null && a.size()> 0) {
final List<Instance> a = findEnrichmentsByAlternateIdentifier(i.getAlternateIdentifier(), ri);
if (a!= null && !a.isEmpty()) {
a.forEach(enr -> applyEnrichment(i, enr));
}
}

@ -8,6 +8,7 @@ import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.Pid;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleaningFunctions {
@ -29,7 +30,7 @@ public class CleaningFunctions {
* @param s the PID whose value will be checked.
* @return false if the pid matches the filter criteria, true otherwise.
*/
public static boolean pidFilter(StructuredProperty s) {
public static boolean pidFilter(Pid s) {
final String pidValue = s.getValue();
if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(pidValue) ||
@ -47,7 +48,7 @@ public class CleaningFunctions {
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
public static Pid normalizePidValue(Pid pid) {
pid.setValue(
normalizePidValue(
pid.getQualifier().getClassid(),
@ -56,6 +57,20 @@ public class CleaningFunctions {
return pid;
}
/**
 * Normalises the value of a structured property according to the class id of its qualifier.
 *
 * <p>The property is modified in place: its value is replaced by the result of
 * {@link #normalizePidValue(String, String)}. Both the property and its qualifier are
 * dereferenced, so neither may be {@code null}.
 *
 * @param pid the structured property whose value will be normalised
 * @return the same instance that was passed in, now holding the normalised value
 */
public static StructuredProperty normalizeSPValue(StructuredProperty pid) {
    final String type = pid.getQualifier().getClassid();
    final String normalized = normalizePidValue(type, pid.getValue());
    pid.setValue(normalized);
    return pid;
}
public static String normalizePidValue(String pidType, String pidValue) {
String value = Optional
.ofNullable(pidValue)

@ -95,7 +95,7 @@ public class IdentifierFactory implements Serializable {
.collect(Collectors.toCollection(HashSet::new));
}
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
public static List<StructuredProperty> getPids(List<Pid> pid, KeyValue collectedFrom) {
return pidFromInstance(pid, collectedFrom, true).distinct().collect(Collectors.toList());
}
@ -143,7 +143,7 @@ public class IdentifierFactory implements Serializable {
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
final Map<String, Set<StructuredProperty>> pids = extractPids(entity);
final Map<String, Set<Pid>> pids = extractPids(entity);
return pids
.values()
@ -164,7 +164,7 @@ public class IdentifierFactory implements Serializable {
.orElseGet(entity::getId);
}
private static <T extends OafEntity> Map<String, Set<StructuredProperty>> extractPids(T entity) {
private static <T extends OafEntity> Map<String, Set<Pid>> extractPids(T entity) {
if (entity instanceof Result) {
return Optional
.ofNullable(((Result) entity).getInstance())
@ -184,7 +184,7 @@ public class IdentifierFactory implements Serializable {
}
}
private static Map<String, Set<StructuredProperty>> mapPids(List<Instance> instance) {
private static Map<String, Set<Pid>> mapPids(List<Instance> instance) {
return instance
.stream()
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false))
@ -196,7 +196,7 @@ public class IdentifierFactory implements Serializable {
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
}
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom,
private static Stream<Pid> pidFromInstance(List<Pid> pid, KeyValue collectedFrom,
boolean mapHandles) {
return Optional
.ofNullable(pid)

@ -4,12 +4,13 @@ package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.Pid;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidValueComparator implements Comparator<StructuredProperty> {
public class PidValueComparator implements Comparator<Pid> {
@Override
public int compare(StructuredProperty left, StructuredProperty right) {
public int compare(Pid left, Pid right) {
if (left == null && right == null)
return 0;
@ -18,15 +19,15 @@ public class PidValueComparator implements Comparator<StructuredProperty> {
if (right == null)
return -1;
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
Pid l = CleaningFunctions.normalizePidValue(left);
Pid r = CleaningFunctions.normalizePidValue(right);
return Optional
.ofNullable(l.getValue())
.map(
lv -> Optional
.ofNullable(r.getValue())
.map(rv -> lv.compareTo(rv))
.map(lv::compareTo)
.orElse(-1))
.orElse(1);
}

@ -109,10 +109,20 @@ class MergeTest {
final Result currentPub = source.get(i);
final Result currentEnrichment = enrichment.get(i);
final Instance currentInstance = Objects.requireNonNull(currentPub.getInstance()).get(0);
if (overrideAlternateIdentifier)
currentInstance.setAlternateIdentifier(Objects.requireNonNull(currentEnrichment.getInstance()).get(0).getPid());
else
currentInstance.setPid(Objects.requireNonNull(currentEnrichment.getInstance()).get(0).getPid());
final List<Pid> pid = Objects.requireNonNull(currentEnrichment.getInstance()).get(0).getPid();
if (overrideAlternateIdentifier) {
currentInstance.setAlternateIdentifier(pid.stream()
.map(p -> {
StructuredProperty sp = new StructuredProperty();
sp.setValue(p.getValue());
sp.setQualifier(p.getQualifier());
sp.setDataInfo(p.getDataInfo());
return sp;
})
.collect(Collectors.toList()));
} else {
currentInstance.setPid(pid);
}
}
}

Loading…
Cancel
Save