forked from D-Net/dnet-hadoop
Core utilities in dhp-common moved to the external module dhp-schemas
This commit is contained in:
parent
ac77a245a3
commit
5afa7d3e0c
|
@ -10,8 +10,8 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
|
|
||||||
public class Vocabulary implements Serializable {
|
public class Vocabulary implements Serializable {
|
||||||
|
|
||||||
|
|
|
@ -7,8 +7,8 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
|
|
@ -1,399 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import com.clearspring.analytics.util.Lists;
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidBlacklistProvider;
|
|
||||||
|
|
||||||
public class CleaningFunctions {
|
|
||||||
|
|
||||||
public static final String DOI_PREFIX_REGEX = "(^10\\.|\\/10.)";
|
|
||||||
public static final String DOI_PREFIX = "10.";
|
|
||||||
|
|
||||||
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
|
||||||
public static final String CLEANING_REGEX = "(?:\\n|\\r|\\t)";
|
|
||||||
|
|
||||||
public static final Set<String> PID_BLACKLIST = new HashSet<>();
|
|
||||||
public static final String INVALID_AUTHOR_REGEX = ".*deactivated.*";
|
|
||||||
public static final String TITLE_FILTER_REGEX = "[.*test.*\\W\\d]";
|
|
||||||
public static final int TITLE_FILTER_RESIDUAL_LENGTH = 10;
|
|
||||||
|
|
||||||
static {
|
|
||||||
PID_BLACKLIST.add("none");
|
|
||||||
PID_BLACKLIST.add("na");
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
|
||||||
if (value instanceof Datasource) {
|
|
||||||
// nothing to clean here
|
|
||||||
} else if (value instanceof Project) {
|
|
||||||
// nothing to clean here
|
|
||||||
} else if (value instanceof Organization) {
|
|
||||||
Organization o = (Organization) value;
|
|
||||||
if (Objects.nonNull(o.getCountry())) {
|
|
||||||
fixVocabName(o.getCountry(), ModelConstants.DNET_COUNTRY_TYPE);
|
|
||||||
}
|
|
||||||
} else if (value instanceof Relation) {
|
|
||||||
// nothing to clean here
|
|
||||||
} else if (value instanceof Result) {
|
|
||||||
|
|
||||||
Result r = (Result) value;
|
|
||||||
|
|
||||||
fixVocabName(r.getLanguage(), ModelConstants.DNET_LANGUAGES);
|
|
||||||
fixVocabName(r.getResourcetype(), ModelConstants.DNET_DATA_CITE_RESOURCE);
|
|
||||||
fixVocabName(r.getBestaccessright(), ModelConstants.DNET_ACCESS_MODES);
|
|
||||||
|
|
||||||
if (Objects.nonNull(r.getSubject())) {
|
|
||||||
r.getSubject().forEach(s -> fixVocabName(s.getQualifier(), ModelConstants.DNET_SUBJECT_TYPOLOGIES));
|
|
||||||
}
|
|
||||||
if (Objects.nonNull(r.getInstance())) {
|
|
||||||
for (Instance i : r.getInstance()) {
|
|
||||||
fixVocabName(i.getAccessright(), ModelConstants.DNET_ACCESS_MODES);
|
|
||||||
fixVocabName(i.getRefereed(), ModelConstants.DNET_REVIEW_LEVELS);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (Objects.nonNull(r.getAuthor())) {
|
|
||||||
r.getAuthor().stream().filter(Objects::nonNull).forEach(a -> {
|
|
||||||
if (Objects.nonNull(a.getPid())) {
|
|
||||||
a.getPid().stream().filter(Objects::nonNull).forEach(p -> {
|
|
||||||
fixVocabName(p.getQualifier(), ModelConstants.DNET_PID_TYPES);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
if (value instanceof Publication) {
|
|
||||||
|
|
||||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
|
||||||
|
|
||||||
} else if (value instanceof OtherResearchProduct) {
|
|
||||||
|
|
||||||
} else if (value instanceof Software) {
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T extends Oaf> boolean filter(T value) {
|
|
||||||
if (value instanceof Datasource) {
|
|
||||||
// nothing to evaluate here
|
|
||||||
} else if (value instanceof Project) {
|
|
||||||
// nothing to evaluate here
|
|
||||||
} else if (value instanceof Organization) {
|
|
||||||
// nothing to evaluate here
|
|
||||||
} else if (value instanceof Relation) {
|
|
||||||
// nothing to clean here
|
|
||||||
} else if (value instanceof Result) {
|
|
||||||
|
|
||||||
Result r = (Result) value;
|
|
||||||
|
|
||||||
if (Objects.nonNull(r.getTitle()) && r.getTitle().isEmpty()) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (value instanceof Publication) {
|
|
||||||
|
|
||||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
|
||||||
|
|
||||||
} else if (value instanceof OtherResearchProduct) {
|
|
||||||
|
|
||||||
} else if (value instanceof Software) {
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T extends Oaf> T cleanup(T value) {
|
|
||||||
if (value instanceof Datasource) {
|
|
||||||
// nothing to clean here
|
|
||||||
} else if (value instanceof Project) {
|
|
||||||
// nothing to clean here
|
|
||||||
} else if (value instanceof Organization) {
|
|
||||||
Organization o = (Organization) value;
|
|
||||||
if (Objects.isNull(o.getCountry()) || StringUtils.isBlank(o.getCountry().getClassid())) {
|
|
||||||
o.setCountry(ModelConstants.UNKNOWN_COUNTRY);
|
|
||||||
}
|
|
||||||
} else if (value instanceof Relation) {
|
|
||||||
// nothing to clean here
|
|
||||||
} else if (value instanceof Result) {
|
|
||||||
|
|
||||||
Result r = (Result) value;
|
|
||||||
if (Objects.nonNull(r.getPublisher()) && StringUtils.isBlank(r.getPublisher().getValue())) {
|
|
||||||
r.setPublisher(null);
|
|
||||||
}
|
|
||||||
if (Objects.isNull(r.getLanguage()) || StringUtils.isBlank(r.getLanguage().getClassid())) {
|
|
||||||
r
|
|
||||||
.setLanguage(
|
|
||||||
qualifier("und", "Undetermined", ModelConstants.DNET_LANGUAGES));
|
|
||||||
}
|
|
||||||
if (Objects.nonNull(r.getSubject())) {
|
|
||||||
r
|
|
||||||
.setSubject(
|
|
||||||
r
|
|
||||||
.getSubject()
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
|
||||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
|
||||||
.map(CleaningFunctions::cleanValue)
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
}
|
|
||||||
if (Objects.nonNull(r.getTitle())) {
|
|
||||||
r
|
|
||||||
.setTitle(
|
|
||||||
r
|
|
||||||
.getTitle()
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
|
||||||
.filter(
|
|
||||||
sp -> sp
|
|
||||||
.getValue()
|
|
||||||
.toLowerCase()
|
|
||||||
.replaceAll(TITLE_FILTER_REGEX, "")
|
|
||||||
.length() > TITLE_FILTER_RESIDUAL_LENGTH)
|
|
||||||
.map(CleaningFunctions::cleanValue)
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
}
|
|
||||||
if (Objects.nonNull(r.getDescription())) {
|
|
||||||
r
|
|
||||||
.setDescription(
|
|
||||||
r
|
|
||||||
.getDescription()
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getValue()))
|
|
||||||
.map(CleaningFunctions::cleanValue)
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
}
|
|
||||||
if (Objects.nonNull(r.getPid())) {
|
|
||||||
r.setPid(processPidCleaning(r.getPid()));
|
|
||||||
}
|
|
||||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
|
||||||
r
|
|
||||||
.setResourcetype(
|
|
||||||
qualifier(ModelConstants.UNKNOWN, "Unknown", ModelConstants.DNET_DATA_CITE_RESOURCE));
|
|
||||||
}
|
|
||||||
if (Objects.nonNull(r.getInstance())) {
|
|
||||||
|
|
||||||
for (Instance i : r.getInstance()) {
|
|
||||||
Optional
|
|
||||||
.ofNullable(i.getPid())
|
|
||||||
.ifPresent(pid -> {
|
|
||||||
final Set<StructuredProperty> pids = pid
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
|
||||||
|
|
||||||
Optional
|
|
||||||
.ofNullable(i.getAlternateIdentifier())
|
|
||||||
.ifPresent(altId -> {
|
|
||||||
final Set<StructuredProperty> altIds = altId
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
|
||||||
.collect(Collectors.toCollection(HashSet::new));
|
|
||||||
|
|
||||||
i.setAlternateIdentifier(Lists.newArrayList(Sets.difference(altIds, pids)));
|
|
||||||
});
|
|
||||||
});
|
|
||||||
|
|
||||||
if (Objects.isNull(i.getAccessright()) || StringUtils.isBlank(i.getAccessright().getClassid())) {
|
|
||||||
i
|
|
||||||
.setAccessright(
|
|
||||||
accessRight(
|
|
||||||
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
|
|
||||||
ModelConstants.DNET_ACCESS_MODES));
|
|
||||||
}
|
|
||||||
if (Objects.isNull(i.getHostedby()) || StringUtils.isBlank(i.getHostedby().getKey())) {
|
|
||||||
i.setHostedby(ModelConstants.UNKNOWN_REPOSITORY);
|
|
||||||
}
|
|
||||||
if (Objects.isNull(i.getRefereed())) {
|
|
||||||
i.setRefereed(qualifier("0000", "Unknown", ModelConstants.DNET_REVIEW_LEVELS));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
|
||||||
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
|
|
||||||
if (Objects.isNull(bestaccessrights)) {
|
|
||||||
r
|
|
||||||
.setBestaccessright(
|
|
||||||
qualifier(
|
|
||||||
ModelConstants.UNKNOWN, ModelConstants.NOT_AVAILABLE,
|
|
||||||
ModelConstants.DNET_ACCESS_MODES));
|
|
||||||
} else {
|
|
||||||
r.setBestaccessright(bestaccessrights);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (Objects.nonNull(r.getAuthor())) {
|
|
||||||
final List<Author> authors = Lists.newArrayList();
|
|
||||||
for (Author a : r.getAuthor()) {
|
|
||||||
if (Objects.isNull(a.getPid())) {
|
|
||||||
a.setPid(Lists.newArrayList());
|
|
||||||
} else {
|
|
||||||
a
|
|
||||||
.setPid(
|
|
||||||
a
|
|
||||||
.getPid()
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(p -> Objects.nonNull(p.getQualifier()))
|
|
||||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
|
||||||
.map(p -> {
|
|
||||||
p.setValue(p.getValue().trim().replaceAll(ORCID_PREFIX_REGEX, ""));
|
|
||||||
return p;
|
|
||||||
})
|
|
||||||
.filter(p -> StringUtils.isNotBlank(p.getValue()))
|
|
||||||
.collect(
|
|
||||||
Collectors
|
|
||||||
.toMap(
|
|
||||||
StructuredProperty::getValue, Function.identity(), (p1, p2) -> p1,
|
|
||||||
LinkedHashMap::new))
|
|
||||||
.values()
|
|
||||||
.stream()
|
|
||||||
.collect(Collectors.toList()));
|
|
||||||
}
|
|
||||||
if (StringUtils.isBlank(a.getFullname())) {
|
|
||||||
if (StringUtils.isNotBlank(a.getName()) && StringUtils.isNotBlank(a.getSurname())) {
|
|
||||||
a.setFullname(a.getSurname() + ", " + a.getName());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (StringUtils.isNotBlank(a.getFullname()) && isValidAuthorName(a)) {
|
|
||||||
authors.add(a);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
boolean nullRank = authors
|
|
||||||
.stream()
|
|
||||||
.anyMatch(a -> Objects.isNull(a.getRank()));
|
|
||||||
if (nullRank) {
|
|
||||||
int i = 1;
|
|
||||||
for (Author author : authors) {
|
|
||||||
author.setRank(i++);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
r.setAuthor(authors);
|
|
||||||
|
|
||||||
}
|
|
||||||
if (value instanceof Publication) {
|
|
||||||
|
|
||||||
} else if (value instanceof eu.dnetlib.dhp.schema.oaf.Dataset) {
|
|
||||||
|
|
||||||
} else if (value instanceof OtherResearchProduct) {
|
|
||||||
|
|
||||||
} else if (value instanceof Software) {
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return value;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isValidAuthorName(Author a) {
|
|
||||||
return !Stream
|
|
||||||
.of(a.getFullname(), a.getName(), a.getSurname())
|
|
||||||
.filter(s -> s != null && !s.isEmpty())
|
|
||||||
.collect(Collectors.joining(""))
|
|
||||||
.toLowerCase()
|
|
||||||
.matches(INVALID_AUTHOR_REGEX);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static List<StructuredProperty> processPidCleaning(List<StructuredProperty> pids) {
|
|
||||||
return pids
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
|
|
||||||
.filter(sp -> !PID_BLACKLIST.contains(sp.getValue().trim().toLowerCase()))
|
|
||||||
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
|
||||||
.map(CleaningFunctions::normalizePidValue)
|
|
||||||
.filter(CleaningFunctions::pidFilter)
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static StructuredProperty cleanValue(StructuredProperty s) {
|
|
||||||
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static Field<String> cleanValue(Field<String> s) {
|
|
||||||
s.setValue(s.getValue().replaceAll(CLEANING_REGEX, " "));
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
|
|
||||||
// HELPERS
|
|
||||||
|
|
||||||
private static void fixVocabName(Qualifier q, String vocabularyName) {
|
|
||||||
if (Objects.nonNull(q) && StringUtils.isBlank(q.getSchemeid())) {
|
|
||||||
q.setSchemeid(vocabularyName);
|
|
||||||
q.setSchemename(vocabularyName);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static AccessRight accessRight(String classid, String classname, String scheme) {
|
|
||||||
return OafMapperUtils
|
|
||||||
.accessRight(
|
|
||||||
classid, classname, scheme, scheme);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Qualifier qualifier(String classid, String classname, String scheme) {
|
|
||||||
return OafMapperUtils
|
|
||||||
.qualifier(
|
|
||||||
classid, classname, scheme, scheme);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Utility method that filter PID values on a per-type basis.
|
|
||||||
* @param s the PID whose value will be checked.
|
|
||||||
* @return false if the pid matches the filter criteria, true otherwise.
|
|
||||||
*/
|
|
||||||
public static boolean pidFilter(StructuredProperty s) {
|
|
||||||
final String pidValue = s.getValue();
|
|
||||||
if (Objects.isNull(s.getQualifier()) ||
|
|
||||||
StringUtils.isBlank(pidValue) ||
|
|
||||||
StringUtils.isBlank(pidValue.replaceAll("(?:\\n|\\r|\\t|\\s)", ""))) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (CleaningFunctions.PID_BLACKLIST.contains(pidValue)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
if (PidBlacklistProvider.getBlacklist(s.getQualifier().getClassid()).contains(pidValue)) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Utility method that normalises PID values on a per-type basis.
|
|
||||||
* @param pid the PID whose value will be normalised.
|
|
||||||
* @return the PID containing the normalised value.
|
|
||||||
*/
|
|
||||||
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
|
||||||
String value = Optional
|
|
||||||
.ofNullable(pid.getValue())
|
|
||||||
.map(String::trim)
|
|
||||||
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
|
||||||
switch (pid.getQualifier().getClassid()) {
|
|
||||||
|
|
||||||
// TODO add cleaning for more PID types as needed
|
|
||||||
case "doi":
|
|
||||||
pid.setValue(value.toLowerCase().replaceFirst(DOI_PREFIX_REGEX, DOI_PREFIX));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
return pid;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,22 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
|
||||||
|
|
||||||
/**
 * Constants defining the upper bounds applied to model objects, plus the naming convention of
 * the index collections.
 */
public class ModelHardLimits {

    /** Second token of a collection name. */
    public static final String LAYOUT = "index";
    /** Third token of a collection name. */
    public static final String INTERPRETATION = "openaire";
    /** Separator placed between the tokens of a collection name. */
    public static final String SEPARATOR = "-";

    public static final int MAX_EXTERNAL_ENTITIES = 50;
    public static final int MAX_AUTHORS = 200;
    public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
    public static final int MAX_TITLE_LENGTH = 5000;
    public static final int MAX_TITLES = 10;
    public static final int MAX_ABSTRACT_LENGTH = 150000;
    public static final int MAX_INSTANCES = 10;

    /**
     * Builds a collection name as {@code <format>-index-openaire}.
     *
     * @param format the metadata format, used as first token of the name
     * @return the format, {@link #LAYOUT} and {@link #INTERPRETATION} joined by {@link #SEPARATOR}
     */
    public static String getCollectionName(String format) {
        return String.join(SEPARATOR, format, LAYOUT, INTERPRETATION);
    }

}
|
|
|
@ -1,364 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
|
||||||
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.function.Predicate;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.AccessRightComparator;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
|
|
||||||
public class OafMapperUtils {
|
|
||||||
|
|
||||||
public static Oaf merge(final Oaf left, final Oaf right) {
|
|
||||||
if (ModelSupport.isSubClass(left, OafEntity.class)) {
|
|
||||||
return mergeEntities((OafEntity) left, (OafEntity) right);
|
|
||||||
} else if (ModelSupport.isSubClass(left, Relation.class)) {
|
|
||||||
((Relation) left).mergeFrom((Relation) right);
|
|
||||||
} else {
|
|
||||||
throw new RuntimeException("invalid Oaf type:" + left.getClass().getCanonicalName());
|
|
||||||
}
|
|
||||||
return left;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static OafEntity mergeEntities(OafEntity left, OafEntity right) {
|
|
||||||
if (ModelSupport.isSubClass(left, Result.class)) {
|
|
||||||
return mergeResults((Result) left, (Result) right);
|
|
||||||
} else if (ModelSupport.isSubClass(left, Datasource.class)) {
|
|
||||||
((Datasource) left).mergeFrom((Datasource) right);
|
|
||||||
} else if (ModelSupport.isSubClass(left, Organization.class)) {
|
|
||||||
((Organization) left).mergeFrom((Organization) right);
|
|
||||||
} else if (ModelSupport.isSubClass(left, Project.class)) {
|
|
||||||
((Project) left).mergeFrom((Project) right);
|
|
||||||
} else {
|
|
||||||
throw new RuntimeException("invalid OafEntity subtype:" + left.getClass().getCanonicalName());
|
|
||||||
}
|
|
||||||
return left;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Result mergeResults(Result left, Result right) {
|
|
||||||
if (new ResultTypeComparator().compare(left, right) < 0) {
|
|
||||||
left.mergeFrom(right);
|
|
||||||
return left;
|
|
||||||
} else {
|
|
||||||
right.mergeFrom(left);
|
|
||||||
return right;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static KeyValue keyValue(final String k, final String v) {
|
|
||||||
final KeyValue kv = new KeyValue();
|
|
||||||
kv.setKey(k);
|
|
||||||
kv.setValue(v);
|
|
||||||
return kv;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<KeyValue> listKeyValues(final String... s) {
|
|
||||||
if (s.length % 2 > 0) {
|
|
||||||
throw new RuntimeException("Invalid number of parameters (k,v,k,v,....)");
|
|
||||||
}
|
|
||||||
|
|
||||||
final List<KeyValue> list = new ArrayList<>();
|
|
||||||
for (int i = 0; i < s.length; i += 2) {
|
|
||||||
list.add(keyValue(s[i], s[i + 1]));
|
|
||||||
}
|
|
||||||
return list;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T> Field<T> field(final T value, final DataInfo info) {
|
|
||||||
if (value == null || StringUtils.isBlank(value.toString())) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
final Field<T> field = new Field<>();
|
|
||||||
field.setValue(value);
|
|
||||||
field.setDataInfo(info);
|
|
||||||
return field;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<Field<String>> listFields(final DataInfo info, final String... values) {
|
|
||||||
return Arrays
|
|
||||||
.stream(values)
|
|
||||||
.map(v -> field(v, info))
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(distinctByKey(f -> f.getValue()))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<Field<String>> listFields(final DataInfo info, final List<String> values) {
|
|
||||||
return values
|
|
||||||
.stream()
|
|
||||||
.map(v -> field(v, info))
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(distinctByKey(f -> f.getValue()))
|
|
||||||
.collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Qualifier unknown(final String schemeid, final String schemename) {
|
|
||||||
return qualifier("UNKNOWN", "Unknown", schemeid, schemename);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static AccessRight accessRight(
|
|
||||||
final String classid,
|
|
||||||
final String classname,
|
|
||||||
final String schemeid,
|
|
||||||
final String schemename) {
|
|
||||||
return accessRight(classid, classname, schemeid, schemename, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static AccessRight accessRight(
|
|
||||||
final String classid,
|
|
||||||
final String classname,
|
|
||||||
final String schemeid,
|
|
||||||
final String schemename,
|
|
||||||
final OpenAccessRoute openAccessRoute) {
|
|
||||||
final AccessRight accessRight = new AccessRight();
|
|
||||||
accessRight.setClassid(classid);
|
|
||||||
accessRight.setClassname(classname);
|
|
||||||
accessRight.setSchemeid(schemeid);
|
|
||||||
accessRight.setSchemename(schemename);
|
|
||||||
accessRight.setOpenAccessRoute(openAccessRoute);
|
|
||||||
return accessRight;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Qualifier qualifier(
|
|
||||||
final String classid,
|
|
||||||
final String classname,
|
|
||||||
final String schemeid,
|
|
||||||
final String schemename) {
|
|
||||||
final Qualifier q = new Qualifier();
|
|
||||||
q.setClassid(classid);
|
|
||||||
q.setClassname(classname);
|
|
||||||
q.setSchemeid(schemeid);
|
|
||||||
q.setSchemename(schemename);
|
|
||||||
return q;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Qualifier qualifier(final Qualifier qualifier) {
|
|
||||||
final Qualifier q = new Qualifier();
|
|
||||||
q.setClassid(qualifier.getClassid());
|
|
||||||
q.setClassname(qualifier.getClassname());
|
|
||||||
q.setSchemeid(qualifier.getSchemeid());
|
|
||||||
q.setSchemename(qualifier.getSchemename());
|
|
||||||
return q;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static StructuredProperty structuredProperty(
|
|
||||||
final String value,
|
|
||||||
final String classid,
|
|
||||||
final String classname,
|
|
||||||
final String schemeid,
|
|
||||||
final String schemename,
|
|
||||||
final DataInfo dataInfo) {
|
|
||||||
|
|
||||||
return structuredProperty(value, qualifier(classid, classname, schemeid, schemename), dataInfo);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static StructuredProperty structuredProperty(
|
|
||||||
final String value,
|
|
||||||
final Qualifier qualifier,
|
|
||||||
final DataInfo dataInfo) {
|
|
||||||
if (value == null) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
final StructuredProperty sp = new StructuredProperty();
|
|
||||||
sp.setValue(value);
|
|
||||||
sp.setQualifier(qualifier);
|
|
||||||
sp.setDataInfo(dataInfo);
|
|
||||||
return sp;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static ExtraInfo extraInfo(
|
|
||||||
final String name,
|
|
||||||
final String value,
|
|
||||||
final String typology,
|
|
||||||
final String provenance,
|
|
||||||
final String trust) {
|
|
||||||
final ExtraInfo info = new ExtraInfo();
|
|
||||||
info.setName(name);
|
|
||||||
info.setValue(value);
|
|
||||||
info.setTypology(typology);
|
|
||||||
info.setProvenance(provenance);
|
|
||||||
info.setTrust(trust);
|
|
||||||
return info;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static OAIProvenance oaiIProvenance(
|
|
||||||
final String identifier,
|
|
||||||
final String baseURL,
|
|
||||||
final String metadataNamespace,
|
|
||||||
final Boolean altered,
|
|
||||||
final String datestamp,
|
|
||||||
final String harvestDate) {
|
|
||||||
|
|
||||||
final OriginDescription desc = new OriginDescription();
|
|
||||||
desc.setIdentifier(identifier);
|
|
||||||
desc.setBaseURL(baseURL);
|
|
||||||
desc.setMetadataNamespace(metadataNamespace);
|
|
||||||
desc.setAltered(altered);
|
|
||||||
desc.setDatestamp(datestamp);
|
|
||||||
desc.setHarvestDate(harvestDate);
|
|
||||||
|
|
||||||
final OAIProvenance p = new OAIProvenance();
|
|
||||||
p.setOriginDescription(desc);
|
|
||||||
|
|
||||||
return p;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Journal journal(
|
|
||||||
final String name,
|
|
||||||
final String issnPrinted,
|
|
||||||
final String issnOnline,
|
|
||||||
final String issnLinking,
|
|
||||||
final DataInfo dataInfo) {
|
|
||||||
return journal(
|
|
||||||
name,
|
|
||||||
issnPrinted,
|
|
||||||
issnOnline,
|
|
||||||
issnLinking,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
null,
|
|
||||||
dataInfo);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Journal journal(
|
|
||||||
final String name,
|
|
||||||
final String issnPrinted,
|
|
||||||
final String issnOnline,
|
|
||||||
final String issnLinking,
|
|
||||||
final String ep,
|
|
||||||
final String iss,
|
|
||||||
final String sp,
|
|
||||||
final String vol,
|
|
||||||
final String edition,
|
|
||||||
final String conferenceplace,
|
|
||||||
final String conferencedate,
|
|
||||||
final DataInfo dataInfo) {
|
|
||||||
|
|
||||||
if (StringUtils.isNotBlank(name)
|
|
||||||
|| StringUtils.isNotBlank(issnPrinted)
|
|
||||||
|| StringUtils.isNotBlank(issnOnline)
|
|
||||||
|| StringUtils.isNotBlank(issnLinking)) {
|
|
||||||
final Journal j = new Journal();
|
|
||||||
j.setName(name);
|
|
||||||
j.setIssnPrinted(issnPrinted);
|
|
||||||
j.setIssnOnline(issnOnline);
|
|
||||||
j.setIssnLinking(issnLinking);
|
|
||||||
j.setEp(ep);
|
|
||||||
j.setIss(iss);
|
|
||||||
j.setSp(sp);
|
|
||||||
j.setVol(vol);
|
|
||||||
j.setEdition(edition);
|
|
||||||
j.setConferenceplace(conferenceplace);
|
|
||||||
j.setConferencedate(conferencedate);
|
|
||||||
j.setDataInfo(dataInfo);
|
|
||||||
return j;
|
|
||||||
} else {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static DataInfo dataInfo(
|
|
||||||
final Boolean deletedbyinference,
|
|
||||||
final String inferenceprovenance,
|
|
||||||
final Boolean inferred,
|
|
||||||
final Boolean invisible,
|
|
||||||
final Qualifier provenanceaction,
|
|
||||||
final String trust) {
|
|
||||||
final DataInfo d = new DataInfo();
|
|
||||||
d.setDeletedbyinference(deletedbyinference);
|
|
||||||
d.setInferenceprovenance(inferenceprovenance);
|
|
||||||
d.setInferred(inferred);
|
|
||||||
d.setInvisible(invisible);
|
|
||||||
d.setProvenanceaction(provenanceaction);
|
|
||||||
d.setTrust(trust);
|
|
||||||
return d;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String createOpenaireId(
|
|
||||||
final int prefix,
|
|
||||||
final String originalId,
|
|
||||||
final boolean to_md5) {
|
|
||||||
if (StringUtils.isBlank(originalId)) {
|
|
||||||
return null;
|
|
||||||
} else if (to_md5) {
|
|
||||||
final String nsPrefix = StringUtils.substringBefore(originalId, "::");
|
|
||||||
final String rest = StringUtils.substringAfter(originalId, "::");
|
|
||||||
return String.format("%s|%s::%s", prefix, nsPrefix, DHPUtils.md5(rest));
|
|
||||||
} else {
|
|
||||||
return String.format("%s|%s", prefix, originalId);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String createOpenaireId(
|
|
||||||
final String type,
|
|
||||||
final String originalId,
|
|
||||||
final boolean to_md5) {
|
|
||||||
switch (type) {
|
|
||||||
case "datasource":
|
|
||||||
return createOpenaireId(10, originalId, to_md5);
|
|
||||||
case "organization":
|
|
||||||
return createOpenaireId(20, originalId, to_md5);
|
|
||||||
case "person":
|
|
||||||
return createOpenaireId(30, originalId, to_md5);
|
|
||||||
case "project":
|
|
||||||
return createOpenaireId(40, originalId, to_md5);
|
|
||||||
default:
|
|
||||||
return createOpenaireId(50, originalId, to_md5);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String asString(final Object o) {
|
|
||||||
return o == null ? "" : o.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T> Predicate<T> distinctByKey(
|
|
||||||
final Function<? super T, ?> keyExtractor) {
|
|
||||||
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
|
|
||||||
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
|
|
||||||
return getBestAccessRights(instanceList);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
|
|
||||||
if (instanceList != null) {
|
|
||||||
final Optional<AccessRight> min = instanceList
|
|
||||||
.stream()
|
|
||||||
.map(i -> i.getAccessright())
|
|
||||||
.min(new AccessRightComparator<>());
|
|
||||||
|
|
||||||
final Qualifier rights = min.isPresent() ? qualifier(min.get()) : new Qualifier();
|
|
||||||
|
|
||||||
if (StringUtils.isBlank(rights.getClassid())) {
|
|
||||||
rights.setClassid(UNKNOWN);
|
|
||||||
}
|
|
||||||
if (StringUtils.isBlank(rights.getClassname())
|
|
||||||
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
|
||||||
rights.setClassname(NOT_AVAILABLE);
|
|
||||||
}
|
|
||||||
if (StringUtils.isBlank(rights.getSchemeid())) {
|
|
||||||
rights.setSchemeid(DNET_ACCESS_MODES);
|
|
||||||
}
|
|
||||||
if (StringUtils.isBlank(rights.getSchemename())) {
|
|
||||||
rights.setSchemename(DNET_ACCESS_MODES);
|
|
||||||
}
|
|
||||||
|
|
||||||
return rights;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,78 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.CROSSREF_ID;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
|
|
||||||
public class ResultTypeComparator implements Comparator<Result> {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(Result left, Result right) {
|
|
||||||
|
|
||||||
if (left == null && right == null)
|
|
||||||
return 0;
|
|
||||||
if (left == null)
|
|
||||||
return 1;
|
|
||||||
if (right == null)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
HashSet<String> lCf = getCollectedFromIds(left);
|
|
||||||
HashSet<String> rCf = getCollectedFromIds(right);
|
|
||||||
|
|
||||||
if (lCf.contains(CROSSREF_ID) && !rCf.contains(CROSSREF_ID)) {
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
if (!lCf.contains(CROSSREF_ID) && rCf.contains(CROSSREF_ID)) {
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
String lClass = left.getResulttype().getClassid();
|
|
||||||
String rClass = right.getResulttype().getClassid();
|
|
||||||
|
|
||||||
if (lClass.equals(rClass))
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
// Else (but unlikely), lexicographical ordering will do.
|
|
||||||
return lClass.compareTo(rClass);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected HashSet<String> getCollectedFromIds(Result left) {
|
|
||||||
return Optional
|
|
||||||
.ofNullable(left.getCollectedfrom())
|
|
||||||
.map(
|
|
||||||
cf -> cf
|
|
||||||
.stream()
|
|
||||||
.map(c -> c.getKey())
|
|
||||||
.collect(Collectors.toCollection(HashSet::new)))
|
|
||||||
.orElse(new HashSet<>());
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,212 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import static com.google.common.base.Preconditions.checkArgument;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
|
||||||
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.util.*;
|
|
||||||
import java.util.function.Function;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
|
|
||||||
import com.google.common.collect.HashBiMap;
|
|
||||||
import com.google.common.collect.Maps;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Factory class for OpenAIRE identifiers in the Graph
|
|
||||||
*/
|
|
||||||
public class IdentifierFactory implements Serializable {
|
|
||||||
|
|
||||||
public static final String ID_SEPARATOR = "::";
|
|
||||||
public static final String ID_PREFIX_SEPARATOR = "|";
|
|
||||||
|
|
||||||
public static final int ID_PREFIX_LEN = 12;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Declares the associations PID_TYPE -> [DATASOURCE ID, NAME] considered authoritative for that PID_TYPE
|
|
||||||
*/
|
|
||||||
public static final Map<PidType, HashBiMap<String, String>> PID_AUTHORITY = Maps.newHashMap();
|
|
||||||
|
|
||||||
static {
|
|
||||||
PID_AUTHORITY.put(PidType.doi, HashBiMap.create());
|
|
||||||
PID_AUTHORITY.get(PidType.doi).put(CROSSREF_ID, "Crossref");
|
|
||||||
PID_AUTHORITY.get(PidType.doi).put(DATACITE_ID, "Datacite");
|
|
||||||
|
|
||||||
PID_AUTHORITY.put(PidType.pmc, HashBiMap.create());
|
|
||||||
PID_AUTHORITY.get(PidType.pmc).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
|
|
||||||
PID_AUTHORITY.get(PidType.pmc).put(PUBMED_CENTRAL_ID, "PubMed Central");
|
|
||||||
|
|
||||||
PID_AUTHORITY.put(PidType.pmid, HashBiMap.create());
|
|
||||||
PID_AUTHORITY.get(PidType.pmid).put(EUROPE_PUBMED_CENTRAL_ID, "Europe PubMed Central");
|
|
||||||
PID_AUTHORITY.get(PidType.pmid).put(PUBMED_CENTRAL_ID, "PubMed Central");
|
|
||||||
|
|
||||||
PID_AUTHORITY.put(PidType.arXiv, HashBiMap.create());
|
|
||||||
PID_AUTHORITY.get(PidType.arXiv).put(ARXIV_ID, "arXiv.org e-Print Archive");
|
|
||||||
}
|
|
||||||
|
|
||||||
public static List<StructuredProperty> getPids(List<StructuredProperty> pid, KeyValue collectedFrom) {
|
|
||||||
return pidFromInstance(pid, collectedFrom, true).distinct().collect(Collectors.toList());
|
|
||||||
}
|
|
||||||
|
|
||||||
public static <T extends Result> String createDOIBoostIdentifier(T entity) {
|
|
||||||
if (entity == null)
|
|
||||||
return null;
|
|
||||||
|
|
||||||
StructuredProperty pid = null;
|
|
||||||
if (entity.getPid() != null) {
|
|
||||||
pid = entity
|
|
||||||
.getPid()
|
|
||||||
.stream()
|
|
||||||
.filter(Objects::nonNull)
|
|
||||||
.filter(s -> s.getQualifier() != null && "doi".equalsIgnoreCase(s.getQualifier().getClassid()))
|
|
||||||
.filter(CleaningFunctions::pidFilter)
|
|
||||||
.findAny()
|
|
||||||
.orElse(null);
|
|
||||||
} else {
|
|
||||||
if (entity.getInstance() != null) {
|
|
||||||
pid = entity
|
|
||||||
.getInstance()
|
|
||||||
.stream()
|
|
||||||
.filter(i -> i.getPid() != null)
|
|
||||||
.flatMap(i -> i.getPid().stream())
|
|
||||||
.filter(CleaningFunctions::pidFilter)
|
|
||||||
.findAny()
|
|
||||||
.orElse(null);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (pid != null)
|
|
||||||
return idFromPid(entity, pid, true);
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Creates an identifier from the most relevant PID (if available) provided by a known PID authority in the given
|
|
||||||
* entity T. Returns entity.id when none of the PIDs meet the selection criteria is available.
|
|
||||||
*
|
|
||||||
* @param entity the entity providing PIDs and a default ID.
|
|
||||||
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
|
|
||||||
* @param md5 indicates whether should hash the PID value or not.
|
|
||||||
* @return an identifier from the most relevant PID, entity.id otherwise
|
|
||||||
*/
|
|
||||||
public static <T extends OafEntity> String createIdentifier(T entity, boolean md5) {
|
|
||||||
|
|
||||||
checkArgument(StringUtils.isNoneBlank(entity.getId()), "missing entity identifier");
|
|
||||||
|
|
||||||
final Map<String, Set<StructuredProperty>> pids = extractPids(entity);
|
|
||||||
|
|
||||||
return pids
|
|
||||||
.values()
|
|
||||||
.stream()
|
|
||||||
.flatMap(s -> s.stream())
|
|
||||||
.min(new PidComparator<>(entity))
|
|
||||||
.map(
|
|
||||||
min -> Optional
|
|
||||||
.ofNullable(pids.get(min.getQualifier().getClassid()))
|
|
||||||
.map(
|
|
||||||
p -> p
|
|
||||||
.stream()
|
|
||||||
.sorted(new PidValueComparator())
|
|
||||||
.findFirst()
|
|
||||||
.map(s -> idFromPid(entity, s, md5))
|
|
||||||
.orElseGet(entity::getId))
|
|
||||||
.orElseGet(entity::getId))
|
|
||||||
.orElseGet(entity::getId);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends OafEntity> Map<String, Set<StructuredProperty>> extractPids(T entity) {
|
|
||||||
if (entity instanceof Result) {
|
|
||||||
return Optional
|
|
||||||
.ofNullable(((Result) entity).getInstance())
|
|
||||||
.map(
|
|
||||||
instance -> mapPids(instance))
|
|
||||||
.orElse(new HashMap<>());
|
|
||||||
} else {
|
|
||||||
return entity
|
|
||||||
.getPid()
|
|
||||||
.stream()
|
|
||||||
.map(CleaningFunctions::normalizePidValue)
|
|
||||||
.filter(CleaningFunctions::pidFilter)
|
|
||||||
.collect(
|
|
||||||
Collectors
|
|
||||||
.groupingBy(
|
|
||||||
p -> p.getQualifier().getClassid(),
|
|
||||||
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Map<String, Set<StructuredProperty>> mapPids(List<Instance> instance) {
|
|
||||||
return instance
|
|
||||||
.stream()
|
|
||||||
.map(i -> pidFromInstance(i.getPid(), i.getCollectedfrom(), false))
|
|
||||||
.flatMap(Function.identity())
|
|
||||||
.collect(
|
|
||||||
Collectors
|
|
||||||
.groupingBy(
|
|
||||||
p -> p.getQualifier().getClassid(),
|
|
||||||
Collectors.mapping(p -> p, Collectors.toCollection(HashSet::new))));
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Stream<StructuredProperty> pidFromInstance(List<StructuredProperty> pid, KeyValue collectedFrom,
|
|
||||||
boolean mapHandles) {
|
|
||||||
return Optional
|
|
||||||
.ofNullable(pid)
|
|
||||||
.map(
|
|
||||||
pp -> pp
|
|
||||||
.stream()
|
|
||||||
// filter away PIDs provided by a DS that is not considered an authority for the
|
|
||||||
// given PID Type
|
|
||||||
.filter(p -> {
|
|
||||||
return shouldFilterPid(collectedFrom, p, mapHandles);
|
|
||||||
})
|
|
||||||
.map(CleaningFunctions::normalizePidValue)
|
|
||||||
.filter(CleaningFunctions::pidFilter))
|
|
||||||
.orElse(Stream.empty());
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean shouldFilterPid(KeyValue collectedFrom, StructuredProperty p, boolean mapHandles) {
|
|
||||||
final PidType pType = PidType.tryValueOf(p.getQualifier().getClassid());
|
|
||||||
return (mapHandles && pType.equals(PidType.handle)) || Optional.ofNullable(collectedFrom).isPresent() &&
|
|
||||||
Optional
|
|
||||||
.ofNullable(PID_AUTHORITY.get(pType))
|
|
||||||
.map(authorities -> {
|
|
||||||
return authorities.containsKey(collectedFrom.getKey())
|
|
||||||
|| authorities.containsValue(collectedFrom.getValue());
|
|
||||||
})
|
|
||||||
.orElse(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @see {@link IdentifierFactory#createIdentifier(OafEntity, boolean)}
|
|
||||||
*/
|
|
||||||
public static <T extends OafEntity> String createIdentifier(T entity) {
|
|
||||||
|
|
||||||
return createIdentifier(entity, true);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s, boolean md5) {
|
|
||||||
return new StringBuilder()
|
|
||||||
.append(ModelSupport.getIdPrefix(entity.getClass()))
|
|
||||||
.append(ID_PREFIX_SEPARATOR)
|
|
||||||
.append(createPrefix(s.getQualifier().getClassid()))
|
|
||||||
.append(ID_SEPARATOR)
|
|
||||||
.append(md5 ? DHPUtils.md5(s.getValue()) : s.getValue())
|
|
||||||
.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
// create the prefix (length = 12)
|
|
||||||
private static String createPrefix(String pidType) {
|
|
||||||
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
|
|
||||||
while (prefix.length() < ID_PREFIX_LEN) {
|
|
||||||
prefix.append("_");
|
|
||||||
}
|
|
||||||
return prefix.substring(0, ID_PREFIX_LEN);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,38 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
public class OrganizationPidComparator implements Comparator<StructuredProperty> {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
|
||||||
|
|
||||||
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
|
|
||||||
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.openorgs))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.openorgs))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.GRID))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.GRID))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.mag_id))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.mag_id))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.urn))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.urn))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,8 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import java.util.HashMap;
|
|
||||||
import java.util.HashSet;
|
|
||||||
|
|
||||||
/**
 * Blacklisted PID values, as a map from a string key to the set of values blacklisted
 * for that key.
 */
public class PidBlacklist extends HashMap<String, HashSet<String>> {

}
|
|
|
@ -1,37 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
public class PidBlacklistProvider {
|
|
||||||
|
|
||||||
private static final PidBlacklist blacklist;
|
|
||||||
|
|
||||||
static {
|
|
||||||
try {
|
|
||||||
String json = IOUtils.toString(IdentifierFactory.class.getResourceAsStream("pid_blacklist.json"));
|
|
||||||
blacklist = new ObjectMapper().readValue(json, PidBlacklist.class);
|
|
||||||
|
|
||||||
} catch (IOException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static PidBlacklist getBlacklist() {
|
|
||||||
return blacklist;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static Set<String> getBlacklist(String pidType) {
|
|
||||||
return Optional
|
|
||||||
.ofNullable(getBlacklist().get(pidType))
|
|
||||||
.orElse(new HashSet<>());
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,48 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
|
|
||||||
|
|
||||||
private T entity;
|
|
||||||
|
|
||||||
public PidComparator(T entity) {
|
|
||||||
this.entity = entity;
|
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
|
||||||
|
|
||||||
if (left == null && right == null)
|
|
||||||
return 0;
|
|
||||||
if (left == null)
|
|
||||||
return 1;
|
|
||||||
if (right == null)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
if (ModelSupport.isSubClass(entity, Result.class)) {
|
|
||||||
return compareResultPids(left, right);
|
|
||||||
}
|
|
||||||
if (ModelSupport.isSubClass(entity, Organization.class)) {
|
|
||||||
return compareOrganizationtPids(left, right);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Else (but unlikely), lexicographical ordering will do.
|
|
||||||
return left.getQualifier().getClassid().compareTo(right.getQualifier().getClassid());
|
|
||||||
}
|
|
||||||
|
|
||||||
private int compareResultPids(StructuredProperty left, StructuredProperty right) {
|
|
||||||
return new ResultPidComparator().compare(left, right);
|
|
||||||
}
|
|
||||||
|
|
||||||
private int compareOrganizationtPids(StructuredProperty left, StructuredProperty right) {
|
|
||||||
return new OrganizationPidComparator().compare(left, right);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,29 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import org.apache.commons.lang3.EnumUtils;
|
|
||||||
|
|
||||||
/**
 * The PID types known to the identifier factory and to the PID comparators.
 */
public enum PidType {

    // Result
    doi, pmid, pmc, handle, arXiv, nct, pdb,

    // Organization
    openorgs, corda, corda_h2020, GRID, mag_id, urn,

    // Used by dedup
    undefined, original;

    /**
     * @param type the name to check, may be {@code null}
     * @return {@code true} when the given name is exactly (case sensitive) the name of a PID type
     */
    public static boolean isValid(String type) {
        return find(type) != null;
    }

    /**
     * Lenient variant of {@code valueOf}.
     *
     * @param s the name to resolve, may be {@code null}
     * @return the PID type with the given name, {@link #original} when there is none
     */
    public static PidType tryValueOf(String s) {
        final PidType found = find(s);
        return found != null ? found : PidType.original;
    }

    /** Looks up a PID type by exact name, returns {@code null} when there is no match. */
    private static PidType find(String name) {
        for (PidType type : values()) {
            if (type.name().equals(name)) {
                return type;
            }
        }
        return null;
    }

}
|
|
|
@ -1,33 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
import java.util.Optional;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
|
|
||||||
public class PidValueComparator implements Comparator<StructuredProperty> {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
|
||||||
|
|
||||||
if (left == null && right == null)
|
|
||||||
return 0;
|
|
||||||
if (left == null)
|
|
||||||
return 1;
|
|
||||||
if (right == null)
|
|
||||||
return -1;
|
|
||||||
|
|
||||||
StructuredProperty l = CleaningFunctions.normalizePidValue(left);
|
|
||||||
StructuredProperty r = CleaningFunctions.normalizePidValue(right);
|
|
||||||
|
|
||||||
return Optional
|
|
||||||
.ofNullable(l.getValue())
|
|
||||||
.map(
|
|
||||||
lv -> Optional
|
|
||||||
.ofNullable(r.getValue())
|
|
||||||
.map(rv -> lv.compareTo(rv))
|
|
||||||
.orElse(-1))
|
|
||||||
.orElse(1);
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,53 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import java.util.Comparator;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
public class ResultPidComparator implements Comparator<StructuredProperty> {
|
|
||||||
|
|
||||||
@Override
|
|
||||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
|
||||||
|
|
||||||
PidType lClass = PidType.tryValueOf(left.getQualifier().getClassid());
|
|
||||||
PidType rClass = PidType.tryValueOf(right.getQualifier().getClassid());
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.doi))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.doi))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.pmid))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.pmid))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.pmc))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.pmc))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.handle))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.handle))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.arXiv))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.arXiv))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.nct))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.nct))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
if (lClass.equals(PidType.pdb))
|
|
||||||
return -1;
|
|
||||||
if (rClass.equals(PidType.pdb))
|
|
||||||
return 1;
|
|
||||||
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,69 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.jetbrains.annotations.NotNull;
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import it.unimi.dsi.fastutil.Hash;
|
|
||||||
|
|
||||||
public class OafMapperUtilsTest {
|
|
||||||
|
|
||||||
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testMergePubs() throws IOException {
|
|
||||||
Publication p1 = read("publication_1.json", Publication.class);
|
|
||||||
Publication p2 = read("publication_2.json", Publication.class);
|
|
||||||
Dataset d1 = read("dataset_1.json", Dataset.class);
|
|
||||||
Dataset d2 = read("dataset_2.json", Dataset.class);
|
|
||||||
|
|
||||||
assertEquals(p1.getCollectedfrom().size(), 1);
|
|
||||||
assertEquals(p1.getCollectedfrom().get(0).getKey(), ModelConstants.CROSSREF_ID);
|
|
||||||
assertEquals(d2.getCollectedfrom().size(), 1);
|
|
||||||
assertFalse(cfId(d2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
|
||||||
|
|
||||||
assertTrue(
|
|
||||||
OafMapperUtils
|
|
||||||
.mergeResults(p1, d2)
|
|
||||||
.getResulttype()
|
|
||||||
.getClassid()
|
|
||||||
.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID));
|
|
||||||
|
|
||||||
assertEquals(p2.getCollectedfrom().size(), 1);
|
|
||||||
assertFalse(cfId(p2.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
|
||||||
assertEquals(d1.getCollectedfrom().size(), 1);
|
|
||||||
assertTrue(cfId(d1.getCollectedfrom()).contains(ModelConstants.CROSSREF_ID));
|
|
||||||
|
|
||||||
assertTrue(
|
|
||||||
OafMapperUtils
|
|
||||||
.mergeResults(p2, d1)
|
|
||||||
.getResulttype()
|
|
||||||
.getClassid()
|
|
||||||
.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID));
|
|
||||||
}
|
|
||||||
|
|
||||||
@NotNull
|
|
||||||
protected HashSet<String> cfId(List<KeyValue> collectedfrom) {
|
|
||||||
return collectedfrom.stream().map(c -> c.getKey()).collect(Collectors.toCollection(HashSet::new));
|
|
||||||
}
|
|
||||||
|
|
||||||
protected <T extends Result> T read(String filename, Class<T> clazz) throws IOException {
|
|
||||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
|
||||||
return OBJECT_MAPPER.readValue(json, clazz);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1,21 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import org.junit.jupiter.api.Assertions;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
public class BlackListProviderTest {
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void blackListTest() {
|
|
||||||
|
|
||||||
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist());
|
|
||||||
Assertions.assertNotNull(PidBlacklistProvider.getBlacklist().get("doi"));
|
|
||||||
Assertions.assertTrue(PidBlacklistProvider.getBlacklist().get("doi").size() > 0);
|
|
||||||
final Set<String> xxx = PidBlacklistProvider.getBlacklist("xxx");
|
|
||||||
Assertions.assertNotNull(xxx);
|
|
||||||
Assertions.assertEquals(0, xxx.size());
|
|
||||||
}
|
|
||||||
}
|
|
|
@ -1,75 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
|
||||||
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
|
||||||
|
|
||||||
import java.io.IOException;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.junit.jupiter.api.Test;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
|
|
||||||
public class IdentifierFactoryTest {
|
|
||||||
|
|
||||||
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
|
||||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testCreateIdentifierForPublication() throws IOException {
|
|
||||||
|
|
||||||
verifyIdentifier(
|
|
||||||
"publication_doi1.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
|
||||||
|
|
||||||
verifyIdentifier(
|
|
||||||
"publication_doi2.json", "50|doi_________::79dbc7a2a56dc1532659f9038843256e", true);
|
|
||||||
|
|
||||||
verifyIdentifier(
|
|
||||||
"publication_doi3.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
|
|
||||||
|
|
||||||
verifyIdentifier(
|
|
||||||
"publication_doi4.json", "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66", true);
|
|
||||||
|
|
||||||
verifyIdentifier(
|
|
||||||
"publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", true);
|
|
||||||
|
|
||||||
verifyIdentifier(
|
|
||||||
"publication_pmc2.json", "50|pmc_________::94e4cb08c93f8733b48e2445d04002ac", true);
|
|
||||||
|
|
||||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
|
||||||
verifyIdentifier("publication_3.json", defaultID, true);
|
|
||||||
verifyIdentifier("publication_4.json", defaultID, true);
|
|
||||||
verifyIdentifier("publication_5.json", defaultID, true);
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
public void testCreateIdentifierForPublicationNoHash() throws IOException {
|
|
||||||
|
|
||||||
verifyIdentifier("publication_doi1.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
|
|
||||||
verifyIdentifier("publication_doi2.json", "50|doi_________::10.1016/j.cmet.2010.03.013", false);
|
|
||||||
verifyIdentifier("publication_pmc1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false);
|
|
||||||
verifyIdentifier(
|
|
||||||
"publication_urn1.json", "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", false);
|
|
||||||
|
|
||||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
|
||||||
verifyIdentifier("publication_3.json", defaultID, false);
|
|
||||||
verifyIdentifier("publication_4.json", defaultID, false);
|
|
||||||
verifyIdentifier("publication_5.json", defaultID, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected void verifyIdentifier(String filename, String expectedID, boolean md5) throws IOException {
|
|
||||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
|
||||||
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
|
|
||||||
|
|
||||||
String id = IdentifierFactory.createIdentifier(pub, md5);
|
|
||||||
|
|
||||||
assertNotNull(id);
|
|
||||||
assertEquals(expectedID, id);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -1 +0,0 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}
|
|
|
@ -1 +0,0 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "resuttype" : { "classid" : "dataset" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository B"} ]}
|
|
|
@ -1 +0,0 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2", "value" : "Crossref"} ]}
|
|
|
@ -1 +0,0 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "resuttype" : { "classid" : "publication" }, "pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}], "collectedfrom" : [ { "key" : "10|openaire____::081b82f96300b6a6e3d282bad31cb6e3", "value" : "Repository A"} ]}
|
|
|
@ -1 +0,0 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}
|
|
|
@ -1 +0,0 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]}
|
|
|
@ -1 +0,0 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"}
|
|
|
@ -1,33 +0,0 @@
|
||||||
{
|
|
||||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
|
||||||
"instance": [
|
|
||||||
{
|
|
||||||
"collectedfrom": {
|
|
||||||
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
|
|
||||||
"value": "Crossref"
|
|
||||||
},
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "doi"},
|
|
||||||
"value": "10.1016/j.cmet.2010.03.013"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "urn"},
|
|
||||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "scp-number"},
|
|
||||||
"value": "79953761260"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "pmc"},
|
|
||||||
"value": "21459329"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,37 +0,0 @@
|
||||||
{
|
|
||||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
|
||||||
"instance": [
|
|
||||||
{
|
|
||||||
"collectedfrom": {
|
|
||||||
"key": "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2",
|
|
||||||
"value": "Crossref"
|
|
||||||
},
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "doi"},
|
|
||||||
"value": "10.1016/j.cmet.2010.03.013"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collectedfrom": {
|
|
||||||
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
|
|
||||||
"value": "Europe PubMed Central"
|
|
||||||
},
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "urn"},
|
|
||||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "scp-number"},
|
|
||||||
"value": "79953761260"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "pmc"},
|
|
||||||
"value": "21459329"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,37 +0,0 @@
|
||||||
{
|
|
||||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
|
||||||
"instance": [
|
|
||||||
{
|
|
||||||
"collectedfrom": {
|
|
||||||
"key": "10|openaire____::1234",
|
|
||||||
"value": "Zenodo"
|
|
||||||
},
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "doi"},
|
|
||||||
"value": "10.1016/j.cmet.2010.03.013"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collectedfrom": {
|
|
||||||
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
|
|
||||||
"value": "Europe PubMed Central"
|
|
||||||
},
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "urn"},
|
|
||||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "scp-number"},
|
|
||||||
"value": "79953761260"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "pmc"},
|
|
||||||
"value": "21459329"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,37 +0,0 @@
|
||||||
{
|
|
||||||
"id": "50|od______2852::38861c44e6052a8d49f59a4c39ba5e66",
|
|
||||||
"instance": [
|
|
||||||
{
|
|
||||||
"collectedfrom": {
|
|
||||||
"key": "10|openaire____::1234",
|
|
||||||
"value": "Zenodo"
|
|
||||||
},
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "doi"},
|
|
||||||
"value": "10.1016/j.cmet.2010.03.013"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "handle"},
|
|
||||||
"value": "11012/83840"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"collectedfrom": {
|
|
||||||
"key": "10|opendoar____::2852",
|
|
||||||
"value": "Digital library of Brno University of Technology"
|
|
||||||
},
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "pmc"},
|
|
||||||
"value": "21459329"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "handle"},
|
|
||||||
"value": "11012/83840"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1 +0,0 @@
|
||||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
|
|
@ -1,21 +0,0 @@
|
||||||
{
|
|
||||||
"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
|
||||||
"instance": [
|
|
||||||
{
|
|
||||||
"collectedfrom": {
|
|
||||||
"key": "10|opendoar____::8b6dd7db9af49e67306feb59a8bdc52c",
|
|
||||||
"value": "Europe PubMed Central"
|
|
||||||
},
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {"classid": "doi"},
|
|
||||||
"value": "10.1016/j.cmet.2010.03.013"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier":{"classid":"pmc"},
|
|
||||||
"value":"21459329"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -1,23 +0,0 @@
|
||||||
{
|
|
||||||
"id": "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f",
|
|
||||||
"pid": [
|
|
||||||
{
|
|
||||||
"qualifier": {
|
|
||||||
"classid": "urn"
|
|
||||||
},
|
|
||||||
"value": "urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {
|
|
||||||
"classid": "scp-number"
|
|
||||||
},
|
|
||||||
"value": "79953761260"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"qualifier": {
|
|
||||||
"classid": "pmcid"
|
|
||||||
},
|
|
||||||
"value": "21459329"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
|
@ -4,10 +4,10 @@ import com.fasterxml.jackson.databind.ObjectMapper
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup
|
||||||
import eu.dnetlib.dhp.schema.action.AtomicAction
|
import eu.dnetlib.dhp.schema.action.AtomicAction
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants
|
import eu.dnetlib.dhp.schema.common.ModelConstants
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory
|
import eu.dnetlib.dhp.schema.oaf.utils.{IdentifierFactory, OafMapperUtils}
|
||||||
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OafMapperUtils, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
|
import eu.dnetlib.dhp.schema.oaf.{AccessRight, Author, DataInfo, Instance, KeyValue, Oaf, OtherResearchProduct, Publication, Qualifier, Relation, Result, Software, StructuredProperty, Dataset => OafDataset}
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils
|
import eu.dnetlib.dhp.utils.DHPUtils
|
||||||
import org.apache.commons.lang3.StringUtils
|
import org.apache.commons.lang3.StringUtils
|
||||||
import org.json4s.DefaultFormats
|
import org.json4s.DefaultFormats
|
||||||
|
|
|
@ -1,26 +1,6 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.collection.plugin.rest;
|
package eu.dnetlib.dhp.collection.plugin.rest;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.collection.CollectorException;
|
|
||||||
import eu.dnetlib.dhp.collection.HttpClientParams;
|
|
||||||
import eu.dnetlib.dhp.collection.JsonUtils;
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.http.HttpHeaders;
|
|
||||||
import org.apache.http.entity.ContentType;
|
|
||||||
import org.slf4j.Logger;
|
|
||||||
import org.slf4j.LoggerFactory;
|
|
||||||
import org.w3c.dom.Node;
|
|
||||||
import org.w3c.dom.NodeList;
|
|
||||||
import org.xml.sax.InputSource;
|
|
||||||
|
|
||||||
import javax.xml.transform.OutputKeys;
|
|
||||||
import javax.xml.transform.Transformer;
|
|
||||||
import javax.xml.transform.TransformerConfigurationException;
|
|
||||||
import javax.xml.transform.TransformerFactory;
|
|
||||||
import javax.xml.transform.dom.DOMSource;
|
|
||||||
import javax.xml.transform.stream.StreamResult;
|
|
||||||
import javax.xml.xpath.*;
|
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
import java.io.StringWriter;
|
import java.io.StringWriter;
|
||||||
import java.io.UnsupportedEncodingException;
|
import java.io.UnsupportedEncodingException;
|
||||||
|
@ -32,6 +12,28 @@ import java.util.Iterator;
|
||||||
import java.util.Queue;
|
import java.util.Queue;
|
||||||
import java.util.concurrent.PriorityBlockingQueue;
|
import java.util.concurrent.PriorityBlockingQueue;
|
||||||
|
|
||||||
|
import javax.xml.transform.OutputKeys;
|
||||||
|
import javax.xml.transform.Transformer;
|
||||||
|
import javax.xml.transform.TransformerConfigurationException;
|
||||||
|
import javax.xml.transform.TransformerFactory;
|
||||||
|
import javax.xml.transform.dom.DOMSource;
|
||||||
|
import javax.xml.transform.stream.StreamResult;
|
||||||
|
import javax.xml.xpath.*;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
import org.apache.http.HttpHeaders;
|
||||||
|
import org.apache.http.entity.ContentType;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.w3c.dom.Node;
|
||||||
|
import org.w3c.dom.NodeList;
|
||||||
|
import org.xml.sax.InputSource;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.collection.CollectorException;
|
||||||
|
import eu.dnetlib.dhp.collection.HttpClientParams;
|
||||||
|
import eu.dnetlib.dhp.collection.JsonUtils;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* log.info(...) equal to log.trace(...) in the application-logs
|
* log.info(...) equal to log.trace(...) in the application-logs
|
||||||
* <p>
|
* <p>
|
||||||
|
|
|
@ -32,6 +32,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -2,15 +2,11 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.SaveMode;
|
import org.apache.spark.sql.SaveMode;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
@ -18,20 +14,12 @@ import org.dom4j.DocumentException;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import com.sun.media.sound.ModelChannelMixer;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import eu.dnetlib.pace.config.DedupConfig;
|
|
||||||
import net.sf.saxon.ma.trie.Tuple2;
|
|
||||||
|
|
||||||
public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
|
public class SparkCopyOpenorgsMergeRels extends AbstractSparkAction {
|
||||||
private static final Logger log = LoggerFactory.getLogger(SparkCopyOpenorgsMergeRels.class);
|
private static final Logger log = LoggerFactory.getLogger(SparkCopyOpenorgsMergeRels.class);
|
||||||
|
|
|
@ -15,6 +15,7 @@ import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
|
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
|
|
|
@ -3,7 +3,6 @@ package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import static java.nio.file.Files.createTempDirectory;
|
import static java.nio.file.Files.createTempDirectory;
|
||||||
|
|
||||||
import static org.apache.spark.sql.functions.count;
|
|
||||||
import static org.junit.jupiter.api.Assertions.*;
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
import static org.mockito.Mockito.lenient;
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
|
@ -23,21 +22,11 @@ import java.util.Properties;
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaPairRDD;
|
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
|
||||||
import org.apache.spark.api.java.function.ForeachFunction;
|
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
|
||||||
import org.apache.spark.api.java.function.PairFunction;
|
|
||||||
import org.apache.spark.sql.Dataset;
|
|
||||||
import org.apache.spark.sql.Encoders;
|
import org.apache.spark.sql.Encoders;
|
||||||
import org.apache.spark.sql.Row;
|
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
import org.apache.spark.util.CollectionsUtils;
|
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.*;
|
||||||
import org.junit.jupiter.api.extension.ExtendWith;
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
import org.junit.platform.commons.util.StringUtils;
|
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
@ -46,14 +35,10 @@ import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
||||||
import scala.Tuple2;
|
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
|
|
|
@ -18,6 +18,7 @@ import com.google.gson.*;
|
||||||
import eu.dnetlib.dhp.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
|
import eu.dnetlib.doiboost.orcidnodoi.util.DumpToActionsUtility;
|
||||||
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
|
import eu.dnetlib.doiboost.orcidnodoi.util.Pair;
|
||||||
|
|
|
@ -2,12 +2,11 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.CleaningFunctions.*;
|
import static eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions.*;
|
||||||
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.FilterFunction;
|
import org.apache.spark.api.java.function.FilterFunction;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -19,6 +19,7 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
|
|
||||||
public abstract class AbstractMdRecordToOafMapper {
|
public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
|
|
|
@ -2,15 +2,7 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
|
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
|
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
|
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
|
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
|
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
|
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
|
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
|
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
|
@ -20,8 +20,9 @@ import com.google.common.collect.Lists;
|
||||||
import eu.dnetlib.dhp.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||||
|
|
||||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
|
|
|
@ -2,7 +2,7 @@
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
|
import static eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils.*;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
@ -13,8 +13,8 @@ import org.dom4j.Node;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
|
||||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
|
@ -21,6 +21,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.CleaningFunctions;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
|
|
@ -19,6 +19,7 @@ import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
|
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
|
|
@ -30,6 +30,7 @@ import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils;
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
public class MigrateDbEntitiesApplicationTest {
|
public class MigrateDbEntitiesApplicationTest {
|
||||||
|
|
|
@ -28,6 +28,7 @@ import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
|
||||||
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -30,6 +30,7 @@ import eu.dnetlib.dhp.oa.provision.model.RelatedEntityWrapper;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.ModelHardLimits;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
import scala.collection.JavaConverters;
|
import scala.collection.JavaConverters;
|
||||||
import scala.collection.Seq;
|
import scala.collection.Seq;
|
||||||
|
|
Loading…
Reference in New Issue