1
0
Fork 0

implemented default merge procedure applied to result.instance

This commit is contained in:
Claudio Atzori 2024-03-25 15:39:14 +01:00
parent c8683eb13c
commit 9fc70a9451
15 changed files with 243 additions and 103 deletions

View File

@ -1,14 +1,15 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.lang3.StringUtils;
import java.util.HashSet; import java.util.HashSet;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleaningFunctions { public class CleaningFunctions {
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)"; public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)";
@ -21,7 +22,8 @@ public class CleaningFunctions {
PID_BLACKLIST.add("na"); PID_BLACKLIST.add("na");
} }
public CleaningFunctions() {} public CleaningFunctions() {
}
/** /**
* Utility method that filter PID values on a per-type basis. * Utility method that filter PID values on a per-type basis.
@ -47,7 +49,8 @@ public class CleaningFunctions {
* @return the PID containing the normalised value. * @return the PID containing the normalised value.
*/ */
public static StructuredProperty normalizePidValue(StructuredProperty pid) { public static StructuredProperty normalizePidValue(StructuredProperty pid) {
pid.setValue( pid
.setValue(
normalizePidValue( normalizePidValue(
pid.getQualifier().getClassid(), pid.getQualifier().getClassid(),
pid.getValue())); pid.getValue()));

View File

@ -1,12 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import com.google.common.collect.HashBiMap; import static com.google.common.base.Preconditions.checkArgument;
import com.google.common.collect.Maps; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import java.io.Serializable; import java.io.Serializable;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
@ -16,8 +12,14 @@ import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import static com.google.common.base.Preconditions.checkArgument; import org.apache.commons.codec.binary.Hex;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
/** /**
* Factory class for OpenAIRE identifiers in the Graph * Factory class for OpenAIRE identifiers in the Graph
@ -87,7 +89,8 @@ public class IdentifierFactory implements Serializable {
} }
public static Set<String> delegatedAuthorityDatasourceIds() { public static Set<String> delegatedAuthorityDatasourceIds() {
return DELEGATED_PID_AUTHORITY.values() return DELEGATED_PID_AUTHORITY
.values()
.stream() .stream()
.flatMap(m -> m.keySet().stream()) .flatMap(m -> m.keySet().stream())
.collect(Collectors.toCollection(HashSet::new)); .collect(Collectors.toCollection(HashSet::new));
@ -210,7 +213,6 @@ public class IdentifierFactory implements Serializable {
.orElse(Stream.empty()); .orElse(Stream.empty());
} }
private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) { private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) {
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid()); final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
@ -220,13 +222,15 @@ public class IdentifierFactory implements Serializable {
boolean isEnrich = Optional boolean isEnrich = Optional
.ofNullable(ENRICHMENT_PROVIDER.get(pType)) .ofNullable(ENRICHMENT_PROVIDER.get(pType))
.map(enrich -> enrich.containsKey(collectedFrom.getKey()) .map(
enrich -> enrich.containsKey(collectedFrom.getKey())
|| enrich.containsValue(collectedFrom.getValue())) || enrich.containsValue(collectedFrom.getValue()))
.orElse(false); .orElse(false);
boolean isAuthority = Optional boolean isAuthority = Optional
.ofNullable(PID_AUTHORITY.get(pType)) .ofNullable(PID_AUTHORITY.get(pType))
.map(authorities -> authorities.containsKey(collectedFrom.getKey()) .map(
authorities -> authorities.containsKey(collectedFrom.getKey())
|| authorities.containsValue(collectedFrom.getValue())) || authorities.containsValue(collectedFrom.getValue()))
.orElse(false); .orElse(false);

View File

@ -1,22 +1,24 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import static org.apache.commons.lang3.ObjectUtils.firstNonNull;
import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.base.Preconditions.checkArgument;
import static org.apache.commons.lang3.ObjectUtils.firstNonNull;
import java.text.ParseException; import java.text.ParseException;
import java.time.ZoneId;
import java.util.*; import java.util.*;
import java.util.function.BinaryOperator; import java.util.function.BinaryOperator;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import com.google.common.base.Joiner;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair; import org.apache.commons.lang3.tuple.Pair;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.base.Joiner;
import eu.dnetlib.dhp.schema.common.AccessRightComparator; import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -173,7 +175,8 @@ public class MergeUtils {
return a || b; return a || b;
} }
private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust, Function<T, K> keyExtractor, BinaryOperator<T> merger) { private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
Function<T, K> keyExtractor, BinaryOperator<T> merger) {
if (left == null) { if (left == null) {
return right; return right;
} else if (right == null) { } else if (right == null) {
@ -402,11 +405,12 @@ public class MergeUtils {
// instance enrichment or union // instance enrichment or union
// review instance equals => add pid to comparision // review instance equals => add pid to comparision
if (!isAnEnrichment(merge) && !isAnEnrichment(enrich)) if (!isAnEnrichment(merge) && !isAnEnrichment(enrich))
merge.setInstance( merge
mergeLists(merge.getInstance(), enrich.getInstance(), trust, .setInstance(
mergeLists(
merge.getInstance(), enrich.getInstance(), trust,
MergeUtils::instanceKeyExtractor, MergeUtils::instanceKeyExtractor,
MergeUtils::instanceMerger MergeUtils::instanceMerger));
));
else { else {
final List<Instance> enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance() final List<Instance> enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance()
: enrich.getInstance(); : enrich.getInstance();
@ -428,12 +432,103 @@ public class MergeUtils {
} }
private static String instanceKeyExtractor(Instance i) { private static String instanceKeyExtractor(Instance i) {
return String.join("::", return String
.join(
"::",
kvKeyExtractor(i.getHostedby()), kvKeyExtractor(i.getHostedby()),
kvKeyExtractor(i.getCollectedfrom()),
qualifierKeyExtractor(i.getAccessright()), qualifierKeyExtractor(i.getAccessright()),
qualifierKeyExtractor(i.getInstancetype()), qualifierKeyExtractor(i.getInstancetype()),
Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null), Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
Optional.ofNullable(i.getPid()).map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::"))).orElse(null)); Optional
.ofNullable(i.getPid())
.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
.orElse(null));
}
/**
 * Merges two instances into a new {@link Instance}.
 *
 * The identifying fields (hostedby, collectedfrom, accessright, instancetype) are copied from {@code i1}.
 * List-valued fields (pid, alternateIdentifier, instanceTypeMapping, measures) are combined through
 * {@code mergeLists}, keeping the left element when two elements share the same key. The remaining scalar
 * fields take the first non-null value, except for refereed, fulltext and dateofacceptance, which have
 * dedicated selection rules.
 *
 * @param i1 the instance providing the identifying fields and the preferred values
 * @param i2 the instance used to complete the information available in {@code i1}
 * @return a new instance combining the two inputs
 */
private static Instance instanceMerger(Instance i1, Instance i2) {
    final Instance merged = new Instance();

    merged.setHostedby(i1.getHostedby());
    merged.setCollectedfrom(i1.getCollectedfrom());
    merged.setAccessright(i1.getAccessright());
    merged.setInstancetype(i1.getInstancetype());

    merged.setPid(mergeLists(i1.getPid(), i2.getPid(), 0, MergeUtils::spKeyExtractor, (sp1, sp2) -> sp1));
    merged
        .setAlternateIdentifier(
            mergeLists(
                i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor,
                (sp1, sp2) -> sp1));

    // A missing refereed qualifier must not take part in the comparison: the comparator would be asked to
    // dereference it. When neither instance declares one, the merged instance has none either.
    final List<Qualifier> refereed = Stream
        .of(i1.getRefereed(), i2.getRefereed())
        .filter(Objects::nonNull)
        .collect(Collectors.toList());
    merged.setRefereed(refereed.isEmpty() ? null : Collections.min(refereed, new RefereedComparator()));

    merged
        .setInstanceTypeMapping(
            mergeLists(
                i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0,
                MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1));
    merged.setFulltext(selectFulltext(i1.getFulltext(), i2.getFulltext()));
    merged.setDateofacceptance(selectOldestDate(i1.getDateofacceptance(), i2.getDateofacceptance()));
    merged.setLicense(firstNonNull(i1.getLicense(), i2.getLicense()));
    merged
        .setProcessingchargeamount(
            firstNonNull(i1.getProcessingchargeamount(), i2.getProcessingchargeamount()));
    merged
        .setProcessingchargecurrency(
            firstNonNull(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency()));
    merged
        .setMeasures(
            mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1));
    merged.setUrl(unionDistinctListOfString(i1.getUrl(), i2.getUrl()));

    return merged;
}
/**
 * Builds the key used to recognise equivalent measures: the measure id followed by the keys of its units,
 * all separated by "::".
 *
 * A measure without a unit list yields a null unit component, which {@link String#join} renders as "null".
 * This is the same convention applied to optional components by {@code instanceKeyExtractor}.
 *
 * @param m the measure to build the key for
 * @return the "::"-separated key
 */
private static String measureKeyExtractor(Measure m) {
    final String unitKeys = Optional
        .ofNullable(m.getUnit())
        .map(units -> units.stream().map(KeyValue::getKey).collect(Collectors.joining("::")))
        .orElse(null);

    return String.join("::", m.getId(), unitKeys);
}
/**
 * Selects the field carrying the oldest date between the two given ones.
 *
 * Dates are parsed with {@code DateParserUtils} and compared at day granularity, in the system default time
 * zone. Only fields carrying a non-blank value take part in the comparison, so that an empty field is never
 * handed to the date parser. When no field can be compared, the first non-null field is returned (null when
 * both are null).
 *
 * @param d1 the first date field, possibly null
 * @param d2 the second date field, possibly null
 * @return the field with the oldest date
 */
private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
    return Stream
        .of(d1, d2)
        .filter(Objects::nonNull)
        .filter(f -> StringUtils.isNotBlank(f.getValue()))
        .min(
            Comparator
                .comparing(
                    (Field<String> f) -> DateParserUtils
                        .parseDate(f.getValue())
                        .toInstant()
                        .atZone(ZoneId.systemDefault())
                        .toLocalDate()))
        .orElse(firstNonNull(d1, d2));
}
/**
 * Chooses the fulltext reference to keep: a value ending with "pdf" is preferred, the first argument taking
 * precedence over the second; otherwise the first non-null value is returned.
 *
 * @param ft1 the first fulltext reference, possibly null
 * @param ft2 the second fulltext reference, possibly null
 * @return the selected fulltext reference, null when both arguments are null
 */
private static String selectFulltext(String ft1, String ft2) {
    final boolean firstIsPdf = StringUtils.endsWith(ft1, "pdf");
    final boolean secondIsPdf = StringUtils.endsWith(ft2, "pdf");

    if (!firstIsPdf && !secondIsPdf) {
        return firstNonNull(ft1, ft2);
    }
    return firstIsPdf ? ft1 : ft2;
}
/**
 * Builds the key used to recognise equivalent instance type mappings: original type, type code, type label and
 * vocabulary name, separated by "::". A null component is rendered as "null" by {@link String#join}.
 *
 * @param itm the instance type mapping to build the key for
 * @return the "::"-separated key
 */
private static String instanceTypeMappingKeyExtractor(InstanceTypeMapping itm) {
return String
.join(
"::",
itm.getOriginalType(),
itm.getTypeCode(),
itm.getTypeLabel(),
itm.getVocabularyName());
} }
private static String kvKeyExtractor(KeyValue kv) { private static String kvKeyExtractor(KeyValue kv) {
@ -444,22 +539,17 @@ public class MergeUtils {
return Optional.ofNullable(q).map(Qualifier::getClassid).orElse(null); return Optional.ofNullable(q).map(Qualifier::getClassid).orElse(null);
} }
private static <T> T FieldKeyExtractor(Field<T> f) { private static <T> T fieldKeyExtractor(Field<T> f) {
return Optional.ofNullable(f).map(Field::getValue).orElse(null); return Optional.ofNullable(f).map(Field::getValue).orElse(null);
} }
private static String spKeyExtractor(StructuredProperty sp) { private static String spKeyExtractor(StructuredProperty sp) {
return Optional.ofNullable(sp).map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier()))).orElse(null); return Optional
.ofNullable(sp)
.map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
.orElse(null);
} }
private static Instance instanceMerger(Instance i1, Instance i2) {
// TODO implement me!
return i1;
}
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) { private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
int trust = compareTrust(original, enrich); int trust = compareTrust(original, enrich);
final T merge = mergeResult(original, enrich); final T merge = mergeResult(original, enrich);

View File

@ -3,7 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
public class ModelHardLimits { public class ModelHardLimits {
private ModelHardLimits() {} private ModelHardLimits() {
}
public static final String LAYOUT = "index"; public static final String LAYOUT = "index";
public static final String INTERPRETATION = "openaire"; public static final String INTERPRETATION = "openaire";

View File

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator; import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OrganizationPidComparator implements Comparator<StructuredProperty> { public class OrganizationPidComparator implements Comparator<StructuredProperty> {
@Override @Override

View File

@ -1,14 +1,15 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.IOUtils;
import java.io.IOException; import java.io.IOException;
import java.util.HashSet; import java.util.HashSet;
import java.util.Optional; import java.util.Optional;
import java.util.Set; import java.util.Set;
import org.apache.commons.io.IOUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
public class PidBlacklistProvider { public class PidBlacklistProvider {
private static final PidBlacklist blacklist; private static final PidBlacklist blacklist;
@ -33,6 +34,7 @@ public class PidBlacklistProvider {
.orElse(new HashSet<>()); .orElse(new HashSet<>());
} }
private PidBlacklistProvider() {} private PidBlacklistProvider() {
}
} }

View File

@ -1,14 +1,14 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> { public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
private final T entity; private final T entity;

View File

@ -1,11 +1,11 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator; import java.util.Comparator;
import java.util.Optional; import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidValueComparator implements Comparator<StructuredProperty> { public class PidValueComparator implements Comparator<StructuredProperty> {
@Override @Override

View File

@ -0,0 +1,39 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
/**
 * Comparator for sorting the values from the dnet:review_levels vocabulary. Qualifiers are ranked by classid,
 * so that the preferred review level compares as the smallest one:
 *
 * peerReviewed (0001) &lt; nonPeerReviewed (0002) &lt; UNKNOWN (0000) &lt; any other classid &lt; null qualifier
 *
 * Qualifiers with the same rank compare as equal, as required by the {@link Comparator} contract.
 */
public class RefereedComparator implements Comparator<Qualifier> {

    @Override
    public int compare(Qualifier left, Qualifier right) {
        return Integer.compare(rank(left), rank(right));
    }

    /**
     * Maps a qualifier to its position in the ordering; the lower the rank, the more preferred the value.
     */
    private static int rank(Qualifier q) {
        if (q == null) {
            return 4;
        }
        final String classid = q.getClassid();
        if ("0001".equals(classid)) {
            return 0;
        }
        if ("0002".equals(classid)) {
            return 1;
        }
        if ("0000".equals(classid)) {
            return 2;
        }
        // unrecognised (or missing) classid: after every known level, before a missing qualifier
        return 3;
    }

}

View File

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator; import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ResultPidComparator implements Comparator<StructuredProperty> { public class ResultPidComparator implements Comparator<StructuredProperty> {
@Override @Override

View File

@ -1,16 +1,16 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.common.ModelConstants; import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashSet; import java.util.HashSet;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
public class ResultTypeComparator implements Comparator<Result> { public class ResultTypeComparator implements Comparator<Result> {

View File

@ -1,11 +1,11 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Set;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import java.util.Set;
class BlackListProviderTest { class BlackListProviderTest {
@Test @Test

View File

@ -1,16 +1,18 @@
package eu.dnetlib.dhp.schema.oaf.utils; package eu.dnetlib.dhp.schema.oaf.utils;
import com.fasterxml.jackson.databind.DeserializationFeature; import static org.junit.jupiter.api.Assertions.assertEquals;
import com.fasterxml.jackson.databind.ObjectMapper; import static org.junit.jupiter.api.Assertions.assertNotNull;
import eu.dnetlib.dhp.schema.oaf.Publication;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import java.io.IOException; import java.io.IOException;
import static org.junit.jupiter.api.Assertions.assertEquals; import org.apache.commons.io.IOUtils;
import static org.junit.jupiter.api.Assertions.assertNotNull; import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
class IdentifierFactoryTest { class IdentifierFactoryTest {

View File

@ -10,24 +10,23 @@ import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.beanutils.BeanUtils; import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature; import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
public class MergeUtilsTest { public class MergeUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper() private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test @Test
void testMergePubs_new() throws IOException { void testMergePubs_new() throws IOException {
Publication pt = read("publication_test.json", Publication.class); Publication pt = read("publication_test.json", Publication.class);