forked from antonis.lempesis/dnet-hadoop
[cleaning] country cleaning must use both PID and AlternateIdentifier fields
This commit is contained in:
parent
b8bafab8a0
commit
7b80b24f82
|
@ -4,9 +4,12 @@ package eu.dnetlib.dhp.oa.graph.clean.country;
|
|||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Collection;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import javax.swing.text.html.Option;
|
||||
|
||||
|
@ -30,6 +33,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
|||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleanContextSparkJob;
|
||||
import eu.dnetlib.dhp.schema.oaf.Country;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
|
@ -110,8 +114,8 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
return r;
|
||||
}
|
||||
|
||||
if (r
|
||||
.getPid()
|
||||
List<StructuredProperty> ids = getPidsAndAltIds(r).collect(Collectors.toList());
|
||||
if (ids
|
||||
.stream()
|
||||
.anyMatch(
|
||||
p -> p
|
||||
|
@ -148,6 +152,42 @@ public class CleanCountrySparkJob implements Serializable {
|
|||
.json(inputPath);
|
||||
}
|
||||
|
||||
private static <T extends Result> Stream<StructuredProperty> getPidsAndAltIds(T r) {
|
||||
final Stream<StructuredProperty> resultPids = Optional
|
||||
.ofNullable(r.getPid())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty());
|
||||
|
||||
final Stream<StructuredProperty> instancePids = Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.flatMap(
|
||||
i -> Optional
|
||||
.ofNullable(i.getPid())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty())))
|
||||
.orElse(Stream.empty());
|
||||
|
||||
final Stream<StructuredProperty> instanceAltIds = Optional
|
||||
.ofNullable(r.getInstance())
|
||||
.map(
|
||||
instance -> instance
|
||||
.stream()
|
||||
.flatMap(
|
||||
i -> Optional
|
||||
.ofNullable(i.getAlternateIdentifier())
|
||||
.map(Collection::stream)
|
||||
.orElse(Stream.empty())))
|
||||
.orElse(Stream.empty());
|
||||
|
||||
return Stream
|
||||
.concat(
|
||||
Stream.concat(resultPids, instancePids),
|
||||
instanceAltIds);
|
||||
}
|
||||
|
||||
private static boolean pidInParam(String value, String[] verifyParam) {
|
||||
for (String s : verifyParam)
|
||||
if (value.startsWith(s))
|
||||
|
|
Loading…
Reference in New Issue