forked from D-Net/dnet-hadoop
implemented default merge procedure applied to result.instance
This commit is contained in:
parent
c8683eb13c
commit
9fc70a9451
|
@ -1,14 +1,15 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class CleaningFunctions {
|
||||
|
||||
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10\\.)";
|
||||
|
@ -21,7 +22,8 @@ public class CleaningFunctions {
|
|||
PID_BLACKLIST.add("na");
|
||||
}
|
||||
|
||||
public CleaningFunctions() {}
|
||||
public CleaningFunctions() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Utility method that filters PID values on a per-type basis.
|
||||
|
@ -47,7 +49,8 @@ public class CleaningFunctions {
|
|||
* @return the PID containing the normalised value.
|
||||
*/
|
||||
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||
pid.setValue(
|
||||
pid
|
||||
.setValue(
|
||||
normalizePidValue(
|
||||
pid.getQualifier().getClassid(),
|
||||
pid.getValue()));
|
||||
|
|
|
@ -1,12 +1,8 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import com.google.common.collect.HashBiMap;
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.codec.binary.Hex;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
@ -16,8 +12,14 @@ import java.util.function.Function;
|
|||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||
import org.apache.commons.codec.binary.Hex;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import com.google.common.collect.HashBiMap;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
/**
|
||||
* Factory class for OpenAIRE identifiers in the Graph
|
||||
|
@ -87,7 +89,8 @@ public class IdentifierFactory implements Serializable {
|
|||
}
|
||||
|
||||
public static Set<String> delegatedAuthorityDatasourceIds() {
|
||||
return DELEGATED_PID_AUTHORITY.values()
|
||||
return DELEGATED_PID_AUTHORITY
|
||||
.values()
|
||||
.stream()
|
||||
.flatMap(m -> m.keySet().stream())
|
||||
.collect(Collectors.toCollection(HashSet::new));
|
||||
|
@ -210,7 +213,6 @@ public class IdentifierFactory implements Serializable {
|
|||
.orElse(Stream.empty());
|
||||
}
|
||||
|
||||
|
||||
private static boolean shouldFilterPidByCriteria(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) {
|
||||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
||||
|
||||
|
@ -220,13 +222,15 @@ public class IdentifierFactory implements Serializable {
|
|||
|
||||
boolean isEnrich = Optional
|
||||
.ofNullable(ENRICHMENT_PROVIDER.get(pType))
|
||||
.map(enrich -> enrich.containsKey(collectedFrom.getKey())
|
||||
.map(
|
||||
enrich -> enrich.containsKey(collectedFrom.getKey())
|
||||
|| enrich.containsValue(collectedFrom.getValue()))
|
||||
.orElse(false);
|
||||
|
||||
boolean isAuthority = Optional
|
||||
.ofNullable(PID_AUTHORITY.get(pType))
|
||||
.map(authorities -> authorities.containsKey(collectedFrom.getKey())
|
||||
.map(
|
||||
authorities -> authorities.containsKey(collectedFrom.getKey())
|
||||
|| authorities.containsValue(collectedFrom.getValue()))
|
||||
.orElse(false);
|
||||
|
||||
|
|
|
@ -1,22 +1,24 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static org.apache.commons.lang3.ObjectUtils.firstNonNull;
|
||||
import static com.google.common.base.Preconditions.checkArgument;
|
||||
import static org.apache.commons.lang3.ObjectUtils.firstNonNull;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.time.ZoneId;
|
||||
import java.util.*;
|
||||
import java.util.function.BinaryOperator;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.lang3.tuple.ImmutablePair;
|
||||
import org.apache.commons.lang3.tuple.Pair;
|
||||
|
||||
import com.github.sisyphsu.dateparser.DateParserUtils;
|
||||
import com.google.common.base.Joiner;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
|
@ -173,7 +175,8 @@ public class MergeUtils {
|
|||
return a || b;
|
||||
}
|
||||
|
||||
private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust, Function<T, K> keyExtractor, BinaryOperator<T> merger) {
|
||||
private static <T, K> List<T> mergeLists(final List<T> left, final List<T> right, int trust,
|
||||
Function<T, K> keyExtractor, BinaryOperator<T> merger) {
|
||||
if (left == null) {
|
||||
return right;
|
||||
} else if (right == null) {
|
||||
|
@ -402,11 +405,12 @@ public class MergeUtils {
|
|||
// instance enrichment or union
|
||||
// review instance equals => add pid to comparison
|
||||
if (!isAnEnrichment(merge) && !isAnEnrichment(enrich))
|
||||
merge.setInstance(
|
||||
mergeLists(merge.getInstance(), enrich.getInstance(), trust,
|
||||
merge
|
||||
.setInstance(
|
||||
mergeLists(
|
||||
merge.getInstance(), enrich.getInstance(), trust,
|
||||
MergeUtils::instanceKeyExtractor,
|
||||
MergeUtils::instanceMerger
|
||||
));
|
||||
MergeUtils::instanceMerger));
|
||||
else {
|
||||
final List<Instance> enrichmentInstances = isAnEnrichment(merge) ? merge.getInstance()
|
||||
: enrich.getInstance();
|
||||
|
@ -428,12 +432,103 @@ public class MergeUtils {
|
|||
}
|
||||
|
||||
private static String instanceKeyExtractor(Instance i) {
|
||||
return String.join("::",
|
||||
return String
|
||||
.join(
|
||||
"::",
|
||||
kvKeyExtractor(i.getHostedby()),
|
||||
kvKeyExtractor(i.getCollectedfrom()),
|
||||
qualifierKeyExtractor(i.getAccessright()),
|
||||
qualifierKeyExtractor(i.getInstancetype()),
|
||||
Optional.ofNullable(i.getUrl()).map(u -> String.join("::", u)).orElse(null),
|
||||
Optional.ofNullable(i.getPid()).map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::"))).orElse(null));
|
||||
Optional
|
||||
.ofNullable(i.getPid())
|
||||
.map(pp -> pp.stream().map(MergeUtils::spKeyExtractor).collect(Collectors.joining("::")))
|
||||
.orElse(null));
|
||||
}
|
||||
|
||||
private static Instance instanceMerger(Instance i1, Instance i2) {
|
||||
Instance i = new Instance();
|
||||
i.setHostedby(i1.getHostedby());
|
||||
i.setCollectedfrom(i1.getCollectedfrom());
|
||||
i.setAccessright(i1.getAccessright());
|
||||
i.setInstancetype(i1.getInstancetype());
|
||||
i.setPid(mergeLists(i1.getPid(), i2.getPid(), 0, MergeUtils::spKeyExtractor, (sp1, sp2) -> sp1));
|
||||
i
|
||||
.setAlternateIdentifier(
|
||||
mergeLists(
|
||||
i1.getAlternateIdentifier(), i2.getAlternateIdentifier(), 0, MergeUtils::spKeyExtractor,
|
||||
(sp1, sp2) -> sp1));
|
||||
|
||||
i
|
||||
.setRefereed(
|
||||
Collections
|
||||
.min(
|
||||
Stream.of(i1.getRefereed(), i2.getRefereed()).collect(Collectors.toList()),
|
||||
new RefereedComparator()));
|
||||
i
|
||||
.setInstanceTypeMapping(
|
||||
mergeLists(
|
||||
i1.getInstanceTypeMapping(), i2.getInstanceTypeMapping(), 0,
|
||||
MergeUtils::instanceTypeMappingKeyExtractor, (itm1, itm2) -> itm1));
|
||||
i.setFulltext(selectFulltext(i1.getFulltext(), i2.getFulltext()));
|
||||
i.setDateofacceptance(selectOldestDate(i1.getDateofacceptance(), i2.getDateofacceptance()));
|
||||
i.setLicense(firstNonNull(i1.getLicense(), i2.getLicense()));
|
||||
i.setProcessingchargeamount(firstNonNull(i1.getProcessingchargeamount(), i2.getProcessingchargeamount()));
|
||||
i.setProcessingchargecurrency(firstNonNull(i1.getProcessingchargecurrency(), i2.getProcessingchargecurrency()));
|
||||
i
|
||||
.setMeasures(
|
||||
mergeLists(i1.getMeasures(), i2.getMeasures(), 0, MergeUtils::measureKeyExtractor, (m1, m2) -> m1));
|
||||
|
||||
i.setUrl(unionDistinctListOfString(i1.getUrl(), i2.getUrl()));
|
||||
|
||||
return i;
|
||||
}
|
||||
|
||||
private static String measureKeyExtractor(Measure m) {
|
||||
return String
|
||||
.join(
|
||||
"::",
|
||||
m.getId(),
|
||||
m
|
||||
.getUnit()
|
||||
.stream()
|
||||
.map(KeyValue::getKey)
|
||||
.collect(Collectors.joining("::")));
|
||||
}
|
||||
|
||||
private static Field<String> selectOldestDate(Field<String> d1, Field<String> d2) {
|
||||
return Stream
|
||||
.of(d1, d2)
|
||||
.filter(Objects::nonNull)
|
||||
.min(
|
||||
Comparator
|
||||
.comparing(
|
||||
f -> DateParserUtils
|
||||
.parseDate(f.getValue())
|
||||
.toInstant()
|
||||
.atZone(ZoneId.systemDefault())
|
||||
.toLocalDate()))
|
||||
.orElse(d1);
|
||||
}
|
||||
|
||||
private static String selectFulltext(String ft1, String ft2) {
|
||||
if (StringUtils.endsWith(ft1, "pdf")) {
|
||||
return ft1;
|
||||
}
|
||||
if (StringUtils.endsWith(ft2, "pdf")) {
|
||||
return ft2;
|
||||
}
|
||||
return firstNonNull(ft1, ft2);
|
||||
}
|
||||
|
||||
private static String instanceTypeMappingKeyExtractor(InstanceTypeMapping itm) {
|
||||
return String
|
||||
.join(
|
||||
"::",
|
||||
itm.getOriginalType(),
|
||||
itm.getTypeCode(),
|
||||
itm.getTypeLabel(),
|
||||
itm.getVocabularyName());
|
||||
}
|
||||
|
||||
private static String kvKeyExtractor(KeyValue kv) {
|
||||
|
@ -444,22 +539,17 @@ public class MergeUtils {
|
|||
return Optional.ofNullable(q).map(Qualifier::getClassid).orElse(null);
|
||||
}
|
||||
|
||||
private static <T> T FieldKeyExtractor(Field<T> f) {
|
||||
private static <T> T fieldKeyExtractor(Field<T> f) {
|
||||
return Optional.ofNullable(f).map(Field::getValue).orElse(null);
|
||||
}
|
||||
|
||||
private static String spKeyExtractor(StructuredProperty sp) {
|
||||
return Optional.ofNullable(sp).map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier()))).orElse(null);
|
||||
return Optional
|
||||
.ofNullable(sp)
|
||||
.map(s -> Joiner.on("::").join(s, qualifierKeyExtractor(s.getQualifier())))
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
private static Instance instanceMerger(Instance i1, Instance i2) {
|
||||
|
||||
// TODO implement me!
|
||||
|
||||
return i1;
|
||||
}
|
||||
|
||||
|
||||
private static <T extends OtherResearchProduct> T mergeORP(T original, T enrich) {
|
||||
int trust = compareTrust(original, enrich);
|
||||
final T merge = mergeResult(original, enrich);
|
||||
|
|
|
@ -3,7 +3,8 @@ package eu.dnetlib.dhp.schema.oaf.utils;
|
|||
|
||||
public class ModelHardLimits {
|
||||
|
||||
private ModelHardLimits() {}
|
||||
private ModelHardLimits() {
|
||||
}
|
||||
|
||||
public static final String LAYOUT = "index";
|
||||
public static final String INTERPRETATION = "openaire";
|
||||
|
|
|
@ -1,10 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class OrganizationPidComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1,14 +1,15 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class PidBlacklistProvider {
|
||||
|
||||
private static final PidBlacklist blacklist;
|
||||
|
@ -33,6 +34,7 @@ public class PidBlacklistProvider {
|
|||
.orElse(new HashSet<>());
|
||||
}
|
||||
|
||||
private PidBlacklistProvider() {}
|
||||
private PidBlacklistProvider() {
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
|
||||
|
||||
private final T entity;
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Optional;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class PidValueComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
|
|
|
@ -0,0 +1,39 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
/**
|
||||
* Comparator for sorting the values from the dnet:review_levels vocabulary, implements the following ordering
|
||||
*
|
||||
* peerReviewed (0001) > nonPeerReviewed (0002) > UNKNOWN (0000)
|
||||
*/
|
||||
public class RefereedComparator implements Comparator<Qualifier> {
|
||||
|
||||
@Override
|
||||
public int compare(Qualifier left, Qualifier right) {
|
||||
|
||||
String lClass = left.getClassid();
|
||||
String rClass = right.getClassid();
|
||||
|
||||
if ("0001".equals(lClass))
|
||||
return -1;
|
||||
if ("0001".equals(rClass))
|
||||
return 1;
|
||||
|
||||
if ("0002".equals(lClass))
|
||||
return -1;
|
||||
if ("0002".equals(rClass))
|
||||
return 1;
|
||||
|
||||
if ("0000".equals(lClass))
|
||||
return -1;
|
||||
if ("0000".equals(rClass))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
|
@ -1,10 +1,10 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class ResultPidComparator implements Comparator<StructuredProperty> {
|
||||
|
||||
@Override
|
||||
|
|
|
@ -1,16 +1,16 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.HashSet;
|
||||
import java.util.Optional;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
|
||||
public class ResultTypeComparator implements Comparator<Result> {
|
||||
|
||||
|
|
|
@ -1,11 +1,11 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
class BlackListProviderTest {
|
||||
|
||||
@Test
|
||||
|
|
|
@ -1,16 +1,18 @@
|
|||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
|
||||
class IdentifierFactoryTest {
|
||||
|
||||
|
|
|
@ -10,24 +10,23 @@ import java.util.HashSet;
|
|||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.beanutils.BeanUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
|
||||
public class MergeUtilsTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
|
||||
@Test
|
||||
void testMergePubs_new() throws IOException {
|
||||
Publication pt = read("publication_test.json", Publication.class);
|
||||
|
|
Loading…
Reference in New Issue