implemented default merge procedure applied to result.instance

mergeutils
Claudio Atzori 1 month ago
parent c8683eb13c
commit 9fc70a9451

@ -1,14 +1,15 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import org.apache.commons.lang3.StringUtils;
import java.util.HashSet;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class CleaningFunctions {
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)";
@ -21,7 +22,8 @@ public class CleaningFunctions {
PID_BLACKLIST.add("na");
}
public CleaningFunctions() {}
public CleaningFunctions() {
}
/**
* Utility method that filter PID values on a per-type basis.
@ -47,7 +49,8 @@ public class CleaningFunctions {
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
pid.setValue(
pid
.setValue(
normalizePidValue(
pid.getQualifier().getClassid(),
pid.getValue()));
@ -57,9 +60,9 @@ public class CleaningFunctions {
public static String normalizePidValue(String pidType, String pidValue) {
String value = Optional
.ofNullable(pidValue)
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
.ofNullable(pidValue)
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pidType) {

@ -1,12 +1,8 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import static com.google.common.base.Preconditions.checkArgument;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.io.Serializable;
import java.nio.charset.StandardCharsets;
@ -16,8 +12,14 @@ import java.util.function.Function;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import static com.google.common.base.Preconditions.checkArgument;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import org.apache.commons.codec.binary.Hex;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.HashBiMap;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
/**
* Factory class for OpenAIRE identifiers in the Graph
@ -87,10 +89,11 @@ public class IdentifierFactory implements Serializable {
}
/**
 * Collects the keys of every map found among the values of DELEGATED_PID_AUTHORITY.
 * NOTE(review): per the method name these keys are presumably datasource identifiers - confirm
 * against the declaration of DELEGATED_PID_AUTHORITY.
 *
 * @return a newly created HashSet holding all the collected keys (duplicates collapse)
 */
public static Set<String> delegatedAuthorityDatasourceIds() {
return DELEGATED_PID_AUTHORITY.values()
.stream()
.flatMap(m -> m.keySet().stream())
.collect(Collectors.toCollection(HashSet::new));
return DELEGATED_PID_AUTHORITY
.values()
.stream()
// flatten the key set of each inner map into a single stream of keys
.flatMap(m -> m.keySet().stream())
.collect(Collectors.toCollection(HashSet::new));
}
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
@ -210,7 +213,6 @@ public class IdentifierFactory implements Serializable {
.orElse(Stream.empty());
}
private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) {
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
@ -219,16 +221,18 @@ public class IdentifierFactory implements Serializable {
}
boolean isEnrich = Optional
.ofNullable(ENRICHMENT_PROVIDER.get(pType))
.map(enrich -> enrich.containsKey(collectedFrom.getKey())
|| enrich.containsValue(collectedFrom.getValue()))
.orElse(false);
.ofNullable(ENRICHMENT_PROVIDER.get(pType))
.map(
enrich -> enrich.containsKey(collectedFrom.getKey())
|| enrich.containsValue(collectedFrom.getValue()))
.orElse(false);
boolean isAuthority = Optional
.ofNullable(PID_AUTHORITY.get(pType))
.map(authorities -> authorities.containsKey(collectedFrom.getKey())
|| authorities.containsValue(collectedFrom.getValue()))
.orElse(false);
.ofNullable(PID_AUTHORITY.get(pType))
.map(
authorities -> authorities.containsKey(collectedFrom.getKey())
|| authorities.containsValue(collectedFrom.getValue()))
.orElse(false);
return (mapHandles && pType.equals(PidType.handle)) || isEnrich || isAuthority;
}
@ -260,12 +264,12 @@ public class IdentifierFactory implements Serializable {
/**
 * Builds an identifier by concatenating, in this order: the numeric prefix, ID_PREFIX_SEPARATOR,
 * the prefix produced by createPrefix(pidType), ID_SEPARATOR and the PID value.
 *
 * @param numericPrefix the leading part of the identifier
 * @param pidType the PID type, passed to createPrefix to obtain the identifier prefix
 * @param pidValue the PID value
 * @param md5 when true the value appended is md5(pidValue), otherwise pidValue is appended as is
 * @return the assembled identifier
 */
public static String idFromPid(String numericPrefix, String pidType, String pidValue, boolean md5) {
return new StringBuilder()
.append(numericPrefix)
.append(ID_PREFIX_SEPARATOR)
.append(createPrefix(pidType))
.append(ID_SEPARATOR)
.append(md5 ? md5(pidValue) : pidValue)
.toString();
.append(numericPrefix)
.append(ID_PREFIX_SEPARATOR)
.append(createPrefix(pidType))
.append(ID_SEPARATOR)
// the value part is hashed only when explicitly requested by the caller
.append(md5 ? md5(pidValue) : pidValue)
.toString();
}
// create the prefix (length = 12)

@ -1,22 +1,24 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.apache.commons.lang3.ObjectUtils.firstNonNull;
import static com.google.common.base.Preconditions.checkArgument;
import static org.apache.commons.lang3.ObjectUtils.firstNonNull;
import java.text.ParseException;
import java.time.ZoneId;
import java.util.*;
import java.util.function.BinaryOperator;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import com.google.common.base.Joiner;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.apache.commons.lang3.tuple.Pair;
import com.github.sisyphsu.dateparser.DateParserUtils;
import com.google.common.base.Joiner;
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
@ -173,7 +175,8 @@ public class MergeUtils {
return a || b;
}
private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust, Function<T, K> keyExtractor, BinaryOperator<T> merger) {
private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
Function<T, K> keyExtractor, BinaryOperator<T> merger) {
if (left == null) {
return right;
} else if (right == null) {
@ -184,11 +187,11 @@ public class MergeUtils {
List<T> l = trust >= 0 ? right : left;
return new ArrayList<>(Stream
.concat(h.stream(), l.stream())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toMap(keyExtractor, v -> v, merger))
.values());
.concat(h.stream(), l.stream())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toMap(keyExtractor, v -> v, merger))
.values());
}
private static <T, K> List<T> unionDistinctLists(final List<T> left, final List<T> right, int trust) {
@ -202,10 +205,10 @@ public class MergeUtils {
List<T> l = trust >= 0 ? right : left;
return Stream
.concat(h.stream(), l.stream())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toList());
.concat(h.stream(), l.stream())
.filter(Objects::nonNull)
.distinct()
.collect(Collectors.toList());
}
private static List<String> unionDistinctListOfString(final List<String> l, final List<String> r) {
@ -402,11 +405,12 @@ public class MergeUtils {
// instance enrichment or union
// review instance equals => add pid to comparision
if (!isAnEnrichment(merge) && !isAnEnrichment(enrich))
merge.setInstance(
mergeLists(merge.getInstance(), enrich.getInstance(), trust,
MergeUtils::instanceKeyExtractor,
MergeUtils::instanceMerger
));
merge
.setInstance(
mergeLists(
merge.getInstance(), enrich.getInstance(), trust,
MergeUtils::instanceKeyExtractor,
MergeUtils::instanceMerger));
else {
final List<Instance> enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance()
: enrich.getInstance();
@ -428,12 +432,103 @@ public class MergeUtils {
}
/**
 * Builds the key used to group instances when instance lists are merged (it is the key extractor
 * passed to mergeLists together with instanceMerger). The key is a "::"-joined string of the
 * hostedby and collectedfrom keys, the class ids of access right and instance type, the URLs and
 * the keys of the PIDs.
 *
 * @param i the instance to derive the key from
 * @return the composite key; missing parts contribute a null element to the join
 */
private static String instanceKeyExtractor(Instance i) {
// earlier form of the key: collectedfrom is not part of it
return String.join("::",
kvKeyExtractor(i.getHostedby()),
qualifierKeyExtractor(i.getAccessright()),
qualifierKeyExtractor(i.getInstancetype()),
Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
Optional.ofNullable(i.getPid()).map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::"))).orElse(null));
// revised form of the key: collectedfrom is included right after hostedby
return String
.join(
"::",
kvKeyExtractor(i.getHostedby()),
kvKeyExtractor(i.getCollectedfrom()),
qualifierKeyExtractor(i.getAccessright()),
qualifierKeyExtractor(i.getInstancetype()),
Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
Optional
.ofNullable(i.getPid())
.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
.orElse(null));
}
/**
 * Merges two instances that share the same key (see instanceKeyExtractor) into a new Instance.
 * <ul>
 * <li>hostedby, collectedfrom, accessright and instancetype are taken from the first instance;</li>
 * <li>pid, alternateIdentifier, instanceTypeMapping and measures are merged by key, keeping the
 * element met first when two elements share a key;</li>
 * <li>refereed is the best value according to RefereedComparator, ignoring missing values;</li>
 * <li>fulltext is chosen by selectFulltext, dateofacceptance by selectOldestDate;</li>
 * <li>license and the processing charge fields are the first non-null value of the two;</li>
 * <li>url is the distinct union of the two URL lists.</li>
 * </ul>
 *
 * @param i1 the first instance, used as source for the identifying fields
 * @param i2 the second instance
 * @return a new Instance combining the two inputs
 */
private static Instance instanceMerger(Instance i1, Instance i2) {
	Instance i = new Instance();
	i.setHostedby(i1.getHostedby());
	i.setCollectedfrom(i1.getCollectedfrom());
	i.setAccessright(i1.getAccessright());
	i.setInstancetype(i1.getInstancetype());
	i.setPid(mergeLists(i1.getPid(), i2.getPid(), 0, MergeUtils::spKeyExtractor, (sp1, sp2) -> sp1));
	i
		.setAlternateIdentifier(
			mergeLists(
				i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor,
				(sp1, sp2) -> sp1));

	// refereed may be missing on either side: the comparator dereferences its arguments, so null
	// qualifiers are dropped before selecting, and an absent value on both sides yields null
	final List<Qualifier> refereed = Stream
		.of(i1.getRefereed(), i2.getRefereed())
		.filter(Objects::nonNull)
		.collect(Collectors.toList());
	i.setRefereed(refereed.isEmpty() ? null : Collections.min(refereed, new RefereedComparator()));

	i
		.setInstanceTypeMapping(
			mergeLists(
				i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0,
				MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1));
	i.setFulltext(selectFulltext(i1.getFulltext(), i2.getFulltext()));
	i.setDateofacceptance(selectOldestDate(i1.getDateofacceptance(), i2.getDateofacceptance()));
	i.setLicense(firstNonNull(i1.getLicense(), i2.getLicense()));
	i.setProcessingchargeamount(firstNonNull(i1.getProcessingchargeamount(), i2.getProcessingchargeamount()));
	i.setProcessingchargecurrency(firstNonNull(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency()));
	i
		.setMeasures(
			mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1));
	i.setUrl(unionDistinctListOfString(i1.getUrl(), i2.getUrl()));
	return i;
}
/**
 * Builds the key identifying a measure when lists of measures are merged: the measure id followed
 * by the keys of its units, all joined by "::".
 *
 * @param m the measure to derive the key from
 * @return the composite key; a measure without units yields the id followed by an empty unit part
 */
private static String measureKeyExtractor(Measure m) {
	// a missing unit list is treated like an empty one instead of failing with a NullPointerException
	final String unitKeys = m.getUnit() == null ? ""
		: m
			.getUnit()
			.stream()
			.filter(Objects::nonNull)
			.map(KeyValue::getKey)
			.collect(Collectors.joining("::"));
	return String.join("::", m.getId(), unitKeys);
}
/**
 * Chooses, between two date fields, the one carrying the earliest date.
 * When only one of the two is available it is returned without being parsed; when both are missing
 * the result is null. Ties are resolved in favour of the first field.
 *
 * @param d1 the first date field, possibly null
 * @param d2 the second date field, possibly null
 * @return the field holding the oldest date
 */
private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
	if (d1 == null) {
		return d2;
	}
	if (d2 == null) {
		return d1;
	}
	// both fields are present: compare them by the local date obtained from their parsed value
	final Comparator<Field<String>> byLocalDate = Comparator
		.comparing(
			field -> DateParserUtils
				.parseDate(field.getValue())
				.toInstant()
				.atZone(ZoneId.systemDefault())
				.toLocalDate());
	return byLocalDate.compare(d1, d2) <= 0 ? d1 : d2;
}
/**
 * Chooses the fulltext reference to keep between two candidates: a value ending with "pdf" wins
 * (the first candidate has precedence), otherwise the first non-null candidate is returned.
 *
 * @param ft1 the first candidate, possibly null
 * @param ft2 the second candidate, possibly null
 * @return the selected fulltext value, or null when both candidates are null
 */
private static String selectFulltext(String ft1, String ft2) {
	final boolean firstIsPdf = StringUtils.endsWith(ft1, "pdf");
	if (!firstIsPdf && StringUtils.endsWith(ft2, "pdf")) {
		return ft2;
	}
	return firstIsPdf ? ft1 : firstNonNull(ft1, ft2);
}
/**
 * Builds the key identifying an instance type mapping when lists of mappings are merged:
 * original type, type code, type label and vocabulary name, joined by "::".
 *
 * @param itm the mapping to derive the key from
 * @return the composite key
 */
private static String instanceTypeMappingKeyExtractor(InstanceTypeMapping itm) {
	final StringJoiner key = new StringJoiner("::");
	key.add(itm.getOriginalType());
	key.add(itm.getTypeCode());
	key.add(itm.getTypeLabel());
	key.add(itm.getVocabularyName());
	return key.toString();
}
private static String kvKeyExtractor(KeyValue kv) {
@ -444,22 +539,17 @@ public class MergeUtils {
return Optional.ofNullable(q).map(Qualifier::getClassid).orElse(null);
}
private static <T> T FieldKeyExtractor(Field<T> f) {
private static <T> T fieldKeyExtractor(Field<T> f) {
return Optional.ofNullable(f).map(Field::getValue).orElse(null);
}
private static String spKeyExtractor(StructuredProperty sp) {
return Optional.ofNullable(sp).map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier()))).orElse(null);
}
private static Instance instanceMerger(Instance i1, Instance i2) {
// TODO implement me!
return i1;
return Optional
.ofNullable(sp)
.map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
.orElse(null);
}
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
int trust = compareTrust(original, enrich);
final T merge = mergeResult(original, enrich);

@ -3,7 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
public class ModelHardLimits {
private ModelHardLimits() {}
private ModelHardLimits() {
}
public static final String LAYOUT = "index";
public static final String INTERPRETATION = "openaire";

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OrganizationPidComparator implements Comparator<StructuredProperty> {
@Override

@ -1,14 +1,15 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.IOUtils;
import java.io.IOException;
import java.util.HashSet;
import java.util.Optional;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import com.fasterxml.jackson.databind.ObjectMapper;
public class PidBlacklistProvider {
private static final PidBlacklist blacklist;
@ -33,6 +34,7 @@ public class PidBlacklistProvider {
.orElse(new HashSet<>());
}
private PidBlacklistProvider() {}
private PidBlacklistProvider() {
}
}

@ -1,14 +1,14 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
private final T entity;

@ -1,11 +1,11 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
import java.util.Optional;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidValueComparator implements Comparator<StructuredProperty> {
@Override

@ -0,0 +1,39 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
/**
 * Comparator for sorting the values from the dnet:review_levels vocabulary, implements the following ordering
 *
 * peerReviewed (0001) > nonPeerReviewed (0002) > UNKNOWN (0000) > any other class id, or a null qualifier
 *
 * Values with a higher priority sort first, i.e. they are the "smaller" ones for this comparator, so
 * that Collections.min selects the best review level. Qualifiers with the same priority compare as
 * equal, as required by the {@link Comparator} contract.
 */
public class RefereedComparator implements Comparator<Qualifier> {

	@Override
	public int compare(Qualifier left, Qualifier right) {
		// comparing ranks keeps the ordering symmetric and returns 0 for equivalent values
		return Integer.compare(rank(left), rank(right));
	}

	/**
	 * Maps a qualifier to its position in the ordering, the lower the better.
	 *
	 * @param q the qualifier, possibly null
	 * @return 0 for peerReviewed, 1 for nonPeerReviewed, 2 for UNKNOWN, 3 for anything else
	 */
	private static int rank(Qualifier q) {
		final String classId = q == null ? null : q.getClassid();
		if ("0001".equals(classId)) {
			return 0;
		}
		if ("0002".equals(classId)) {
			return 1;
		}
		if ("0000".equals(classId)) {
			return 2;
		}
		return 3;
	}
}

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class ResultPidComparator implements Comparator<StructuredProperty> {
@Override

@ -1,16 +1,16 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
import java.util.Comparator;
import java.util.HashSet;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Result;
public class ResultTypeComparator implements Comparator<Result> {

@ -1,11 +1,11 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Set;
import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.Test;
import java.util.Set;
class BlackListProviderTest {
@Test

@ -1,16 +1,18 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import java.io.IOException;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import eu.dnetlib.dhp.schema.oaf.Publication;
class IdentifierFactoryTest {
@ -42,7 +44,7 @@ class IdentifierFactoryTest {
"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
verifyIdentifier(
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
"publication_openapc.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID, true);
@ -69,7 +71,7 @@ class IdentifierFactoryTest {
@Test
void testCreateIdentifierForROHub() throws IOException {
verifyIdentifier(
"orp-rohub.json", "50|w3id________::afc7592914ae190a50570db90f55f9c2", true);
"orp-rohub.json", "50|w3id________::afc7592914ae190a50570db90f55f9c2", true);
}
protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException {

@ -10,24 +10,23 @@ import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
public class MergeUtilsTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
void testMergePubs_new() throws IOException {
Publication pt = read("publication_test.json", Publication.class);

@ -178,10 +178,10 @@ class OafMapperUtilsTest {
assertEquals(
ModelConstants.DATASET_RESULTTYPE_CLASSID,
((Result) MergeUtils
((Result) MergeUtils
.merge(p2, d1))
.getResulttype()
.getClassid());
.getResulttype()
.getClassid());
}
@Test

Loading…
Cancel
Save