forked from D-Net/dnet-hadoop
Merge branch 'stable_ids' into deduptesting
This commit is contained in:
commit
3f2d3253e4
|
@ -29,6 +29,12 @@
|
||||||
<artifactId>spark-sql_2.11</artifactId>
|
<artifactId>spark-sql_2.11</artifactId>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
|
||||||
|
<dependency>
|
||||||
|
<groupId>eu.dnetlib.dhp</groupId>
|
||||||
|
<artifactId>dhp-schemas</artifactId>
|
||||||
|
<version>${project.version}</version>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>commons-cli</groupId>
|
<groupId>commons-cli</groupId>
|
||||||
<artifactId>commons-cli</artifactId>
|
<artifactId>commons-cli</artifactId>
|
||||||
|
|
|
@ -94,7 +94,13 @@ public class AuthorMerger {
|
||||||
if (r.getPid() == null) {
|
if (r.getPid() == null) {
|
||||||
r.setPid(new ArrayList<>());
|
r.setPid(new ArrayList<>());
|
||||||
}
|
}
|
||||||
r.getPid().add(a._1());
|
|
||||||
|
// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
|
||||||
|
// it creates of fixed size, and the add method raise UnsupportedOperationException at
|
||||||
|
// java.util.AbstractList.add
|
||||||
|
final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
|
||||||
|
tmp.add(a._1());
|
||||||
|
r.setPid(tmp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
|
@ -1,8 +1,9 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.clean;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
import java.util.LinkedHashMap;
|
import java.util.LinkedHashMap;
|
||||||
import java.util.Objects;
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
@ -10,14 +11,13 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.clearspring.analytics.util.Lists;
|
import com.clearspring.analytics.util.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
|
|
||||||
public class CleaningFunctions {
|
public class CleaningFunctions {
|
||||||
|
|
||||||
|
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
|
||||||
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
|
||||||
|
public static final String NONE = "none";
|
||||||
|
|
||||||
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
public static <T extends Oaf> T fixVocabularyNames(T value) {
|
||||||
if (value instanceof Datasource) {
|
if (value instanceof Datasource) {
|
||||||
|
@ -71,7 +71,7 @@ public class CleaningFunctions {
|
||||||
return value;
|
return value;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected static <T extends Oaf> T fixDefaults(T value) {
|
public static <T extends Oaf> T fixDefaults(T value) {
|
||||||
if (value instanceof Datasource) {
|
if (value instanceof Datasource) {
|
||||||
// nothing to clean here
|
// nothing to clean here
|
||||||
} else if (value instanceof Project) {
|
} else if (value instanceof Project) {
|
||||||
|
@ -106,6 +106,20 @@ public class CleaningFunctions {
|
||||||
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
if (Objects.nonNull(r.getPid())) {
|
||||||
|
r
|
||||||
|
.setPid(
|
||||||
|
r
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(Objects::nonNull)
|
||||||
|
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
|
||||||
|
.filter(sp -> NONE.equalsIgnoreCase(sp.getValue()))
|
||||||
|
.filter(sp -> Objects.nonNull(sp.getQualifier()))
|
||||||
|
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
|
||||||
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
}
|
||||||
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
|
||||||
r
|
r
|
||||||
.setResourcetype(
|
.setResourcetype(
|
||||||
|
@ -125,7 +139,7 @@ public class CleaningFunctions {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
|
||||||
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance());
|
Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
|
||||||
if (Objects.isNull(bestaccessrights)) {
|
if (Objects.isNull(bestaccessrights)) {
|
||||||
r
|
r
|
||||||
.setBestaccessright(
|
.setBestaccessright(
|
||||||
|
@ -201,4 +215,24 @@ public class CleaningFunctions {
|
||||||
classid, classname, scheme, scheme);
|
classid, classname, scheme, scheme);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Utility method that normalises PID values on a per-type basis.
|
||||||
|
* @param pid the PID whose value will be normalised.
|
||||||
|
* @return the PID containing the normalised value.
|
||||||
|
*/
|
||||||
|
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
|
||||||
|
String value = Optional
|
||||||
|
.ofNullable(pid.getValue())
|
||||||
|
.map(String::trim)
|
||||||
|
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
|
||||||
|
switch (pid.getQualifier().getClassid()) {
|
||||||
|
|
||||||
|
// TODO add cleaning for more PID types as needed
|
||||||
|
case "doi":
|
||||||
|
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return pid;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
|
@ -1,14 +1,14 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.provision;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
public class ProvisionConstants {
|
public class ModelHardLimits {
|
||||||
|
|
||||||
public static final int MAX_EXTERNAL_ENTITIES = 50;
|
public static final int MAX_EXTERNAL_ENTITIES = 50;
|
||||||
public static final int MAX_AUTHORS = 200;
|
public static final int MAX_AUTHORS = 200;
|
||||||
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
||||||
public static final int MAX_TITLE_LENGTH = 5000;
|
public static final int MAX_TITLE_LENGTH = 5000;
|
||||||
public static final int MAX_TITLES = 10;
|
public static final int MAX_TITLES = 10;
|
||||||
public static final int MAX_ABSTRACT_LENGTH = 100000;
|
public static final int MAX_ABSTRACT_LENGTH = 150000;
|
||||||
public static final int MAX_INSTANCES = 10;
|
public static final int MAX_INSTANCES = 10;
|
||||||
|
|
||||||
}
|
}
|
|
@ -1,11 +1,10 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
import java.util.Arrays;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Map;
|
import java.util.*;
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.concurrent.ConcurrentHashMap;
|
import java.util.concurrent.ConcurrentHashMap;
|
||||||
import java.util.function.Function;
|
import java.util.function.Function;
|
||||||
import java.util.function.Predicate;
|
import java.util.function.Predicate;
|
||||||
|
@ -13,15 +12,7 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.common.LicenseComparator;
|
||||||
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
public class OafMapperUtils {
|
public class OafMapperUtils {
|
||||||
|
@ -270,4 +261,36 @@ public class OafMapperUtils {
|
||||||
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
|
final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
|
||||||
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
|
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
|
||||||
|
return getBestAccessRights(instanceList);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
|
||||||
|
if (instanceList != null) {
|
||||||
|
final Optional<Qualifier> min = instanceList
|
||||||
|
.stream()
|
||||||
|
.map(i -> i.getAccessright())
|
||||||
|
.min(new LicenseComparator());
|
||||||
|
|
||||||
|
final Qualifier rights = min.isPresent() ? min.get() : new Qualifier();
|
||||||
|
|
||||||
|
if (StringUtils.isBlank(rights.getClassid())) {
|
||||||
|
rights.setClassid(UNKNOWN);
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(rights.getClassname())
|
||||||
|
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
||||||
|
rights.setClassname(NOT_AVAILABLE);
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(rights.getSchemeid())) {
|
||||||
|
rights.setSchemeid(DNET_ACCESS_MODES);
|
||||||
|
}
|
||||||
|
if (StringUtils.isBlank(rights.getSchemename())) {
|
||||||
|
rights.setSchemename(DNET_ACCESS_MODES);
|
||||||
|
}
|
||||||
|
|
||||||
|
return rights;
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
}
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
|
||||||
|
public class ResultTypeComparator implements Comparator<Result> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(Result left, Result right) {
|
||||||
|
|
||||||
|
if (left == null && right == null)
|
||||||
|
return 0;
|
||||||
|
if (left == null)
|
||||||
|
return 1;
|
||||||
|
if (right == null)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
String lClass = left.getResulttype().getClassid();
|
||||||
|
String rClass = right.getResulttype().getClassid();
|
||||||
|
|
||||||
|
if (lClass.equals(rClass))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||||
|
return -1;
|
||||||
|
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
// Else (but unlikely), lexicographical ordering will do.
|
||||||
|
return lClass.compareTo(rClass);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,102 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Objects;
|
||||||
|
import java.util.Optional;
|
||||||
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
import org.apache.commons.lang.StringUtils;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory class for OpenAIRE identifiers in the Graph
|
||||||
|
*/
|
||||||
|
public class IdentifierFactory implements Serializable {
|
||||||
|
|
||||||
|
public static final String ID_SEPARATOR = "::";
|
||||||
|
public static final String ID_PREFIX_SEPARATOR = "|";
|
||||||
|
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
|
||||||
|
+ "[a-zA-Z0-9]{32}$";
|
||||||
|
|
||||||
|
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
|
||||||
|
"(^10\\.1002\\/[^\\s]+$)|" +
|
||||||
|
"(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)|" +
|
||||||
|
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
|
||||||
|
|
||||||
|
public static final int ID_PREFIX_LEN = 12;
|
||||||
|
public static final String NONE = "none";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
|
||||||
|
* when no PID is available
|
||||||
|
* @param entity the entity providing PIDs and a default ID.
|
||||||
|
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
|
||||||
|
* @return an identifier from the most relevant PID, entity.id otherwise
|
||||||
|
*/
|
||||||
|
public static <T extends OafEntity> String createIdentifier(T entity) {
|
||||||
|
|
||||||
|
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
|
||||||
|
return entity.getId();
|
||||||
|
}
|
||||||
|
|
||||||
|
return entity
|
||||||
|
.getPid()
|
||||||
|
.stream()
|
||||||
|
.filter(s -> pidFilter(s))
|
||||||
|
.min(new PidComparator<>(entity))
|
||||||
|
.map(s -> idFromPid(entity, s))
|
||||||
|
.map(IdentifierFactory::verifyIdSyntax)
|
||||||
|
.orElseGet(entity::getId);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static boolean pidFilter(StructuredProperty s) {
|
||||||
|
if (Objects.isNull(s.getQualifier()) ||
|
||||||
|
StringUtils.isBlank(StringUtils.trim(s.getValue()))) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
switch (PidType.valueOf(s.getQualifier().getClassid())) {
|
||||||
|
case doi:
|
||||||
|
final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue()));
|
||||||
|
return doi.matches(DOI_REGEX);
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
} catch (IllegalArgumentException e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String verifyIdSyntax(String s) {
|
||||||
|
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
|
||||||
|
throw new RuntimeException(String.format("malformed id: '%s'", s));
|
||||||
|
} else {
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
|
||||||
|
return new StringBuilder()
|
||||||
|
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
||||||
|
.append(ID_PREFIX_SEPARATOR)
|
||||||
|
.append(createPrefix(s.getQualifier().getClassid()))
|
||||||
|
.append(ID_SEPARATOR)
|
||||||
|
.append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue()))
|
||||||
|
.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
// create the prefix (length = 12)
|
||||||
|
private static String createPrefix(String pidType) {
|
||||||
|
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
|
||||||
|
while (prefix.length() < ID_PREFIX_LEN) {
|
||||||
|
prefix.append("_");
|
||||||
|
}
|
||||||
|
return prefix.substring(0, ID_PREFIX_LEN);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,27 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
public class OrganizationPidComparator implements Comparator<PidType> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(PidType pLeft, PidType pRight) {
|
||||||
|
if (pLeft.equals(PidType.GRID))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.GRID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.mag_id))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.mag_id))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.urn))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.urn))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,54 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||||
|
|
||||||
|
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
|
||||||
|
|
||||||
|
private T entity;
|
||||||
|
|
||||||
|
public PidComparator(T entity) {
|
||||||
|
this.entity = entity;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||||
|
|
||||||
|
if (left == null && right == null)
|
||||||
|
return 0;
|
||||||
|
if (left == null)
|
||||||
|
return 1;
|
||||||
|
if (right == null)
|
||||||
|
return -1;
|
||||||
|
|
||||||
|
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
||||||
|
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
||||||
|
|
||||||
|
if (lClass.equals(rClass))
|
||||||
|
return 0;
|
||||||
|
|
||||||
|
if (ModelSupport.isSubClass(entity, Result.class)) {
|
||||||
|
return compareResultPids(lClass, rClass);
|
||||||
|
}
|
||||||
|
if (ModelSupport.isSubClass(entity, Organization.class)) {
|
||||||
|
return compareOrganizationtPids(lClass, rClass);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Else (but unlikely), lexicographical ordering will do.
|
||||||
|
return lClass.compareTo(rClass);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int compareResultPids(PidType lClass, PidType rClass) {
|
||||||
|
return new ResultPidComparator().compare(lClass, rClass);
|
||||||
|
}
|
||||||
|
|
||||||
|
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
|
||||||
|
return new OrganizationPidComparator().compare(lClass, rClass);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,29 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.EnumUtils;
|
||||||
|
|
||||||
|
public enum PidType {
|
||||||
|
|
||||||
|
// Result
|
||||||
|
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
|
||||||
|
|
||||||
|
// Organization
|
||||||
|
GRID, mag_id, urn,
|
||||||
|
|
||||||
|
// Used by dedup
|
||||||
|
undefined, original;
|
||||||
|
|
||||||
|
public static boolean isValid(String type) {
|
||||||
|
return EnumUtils.isValidEnum(PidType.class, type);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static PidType tryValueOf(String s) {
|
||||||
|
try {
|
||||||
|
return PidType.valueOf(s);
|
||||||
|
} catch (Exception e) {
|
||||||
|
return PidType.original;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import java.util.Comparator;
|
||||||
|
|
||||||
|
public class ResultPidComparator implements Comparator<PidType> {
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public int compare(PidType pLeft, PidType pRight) {
|
||||||
|
if (pLeft.equals(PidType.doi))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.doi))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.pmid))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.pmid))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.pmc))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.pmc))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.handle))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.handle))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.arXiv))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.arXiv))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.NCID))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.NCID))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.GBIF))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.GBIF))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.nct))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.nct))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
if (pLeft.equals(PidType.urn))
|
||||||
|
return -1;
|
||||||
|
if (pRight.equals(PidType.urn))
|
||||||
|
return 1;
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,47 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.*;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||||
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
|
public class IdentifierFactoryTest {
|
||||||
|
|
||||||
|
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCreateIdentifierForPublication() throws IOException {
|
||||||
|
|
||||||
|
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
||||||
|
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
||||||
|
verifyIdentifier(
|
||||||
|
"publication_urn.json",
|
||||||
|
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
||||||
|
|
||||||
|
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||||
|
verifyIdentifier("publication_3.json", defaultID);
|
||||||
|
verifyIdentifier("publication_4.json", defaultID);
|
||||||
|
verifyIdentifier("publication_5.json", defaultID);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
|
||||||
|
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
||||||
|
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
|
||||||
|
|
||||||
|
String id = IdentifierFactory.createIdentifier(pub);
|
||||||
|
|
||||||
|
assertNotNull(id);
|
||||||
|
assertEquals(expectedID, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1 @@
|
||||||
|
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}
|
|
@ -0,0 +1 @@
|
||||||
|
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]}
|
|
@ -0,0 +1 @@
|
||||||
|
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"}
|
|
@ -0,0 +1 @@
|
||||||
|
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
|
@ -0,0 +1 @@
|
||||||
|
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
|
@ -0,0 +1 @@
|
||||||
|
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmcid"},"value":"21459329"}]}
|
|
@ -39,15 +39,15 @@ public class ModelConstants {
|
||||||
public static final String IS_SUPPLEMENT_TO = "isSupplementTo";
|
public static final String IS_SUPPLEMENT_TO = "isSupplementTo";
|
||||||
public static final String IS_SUPPLEMENTED_BY = "isSupplementedBy";
|
public static final String IS_SUPPLEMENTED_BY = "isSupplementedBy";
|
||||||
public static final String PART = "part";
|
public static final String PART = "part";
|
||||||
public static final String IS_PART_OF = "IsPartOf";
|
public static final String IS_PART_OF = "isPartOf";
|
||||||
public static final String HAS_PARTS = "HasParts";
|
public static final String HAS_PARTS = "hasParts";
|
||||||
public static final String RELATIONSHIP = "relationship";
|
public static final String RELATIONSHIP = "relationship";
|
||||||
public static final String CITATION = "citation";
|
public static final String CITATION = "citation";
|
||||||
public static final String CITES = "cites";
|
public static final String CITES = "cites";
|
||||||
public static final String IS_CITED_BY = "IsCitedBy";
|
public static final String IS_CITED_BY = "isCitedBy";
|
||||||
public static final String REVIEW = "review";
|
public static final String REVIEW = "review";
|
||||||
public static final String REVIEWS = "reviews";
|
public static final String REVIEWS = "reviews";
|
||||||
public static final String IS_REVIEWED_BY = "IsReviewedBy";
|
public static final String IS_REVIEWED_BY = "isReviewedBy";
|
||||||
|
|
||||||
public static final String RESULT_PROJECT = "resultProject";
|
public static final String RESULT_PROJECT = "resultProject";
|
||||||
public static final String OUTCOME = "outcome";
|
public static final String OUTCOME = "outcome";
|
||||||
|
|
|
@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.project;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
|
@ -11,6 +12,7 @@ import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.*;
|
import org.apache.spark.sql.*;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
|
@ -175,43 +177,54 @@ public class PrepareProgramme {
|
||||||
return csvProgramme;
|
return csvProgramme;
|
||||||
});
|
});
|
||||||
|
|
||||||
prepareClassification(h2020Programmes);
|
// prepareClassification(h2020Programmes);
|
||||||
|
|
||||||
h2020Programmes
|
JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
|
||||||
.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme))
|
|
||||||
|
JavaRDD<CSVProgramme> rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1);
|
||||||
|
rdd
|
||||||
|
.map(csvProgramme -> {
|
||||||
|
String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme);
|
||||||
|
return tmp;
|
||||||
|
})
|
||||||
.saveAsTextFile(outputPath);
|
.saveAsTextFile(outputPath);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
|
private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
|
||||||
Object[] codedescription = h2020Programmes
|
Object[] codedescription = h2020Programmes
|
||||||
.map(value -> new Tuple2<>(value.getCode(), value.getTitle()))
|
.map(
|
||||||
|
value -> new Tuple2<>(value.getCode(),
|
||||||
|
new Tuple2<String, String>(value.getTitle(), value.getShortTitle())))
|
||||||
.collect()
|
.collect()
|
||||||
.toArray();
|
.toArray();
|
||||||
|
|
||||||
for (int i = 0; i < codedescription.length - 1; i++) {
|
for (int i = 0; i < codedescription.length - 1; i++) {
|
||||||
for (int j = i + 1; j < codedescription.length; j++) {
|
for (int j = i + 1; j < codedescription.length; j++) {
|
||||||
Tuple2<String, String> t2i = (Tuple2<String, String>) codedescription[i];
|
Tuple2<String, Tuple2<String, String>> t2i = (Tuple2<String, Tuple2<String, String>>) codedescription[i];
|
||||||
Tuple2<String, String> t2j = (Tuple2<String, String>) codedescription[j];
|
Tuple2<String, Tuple2<String, String>> t2j = (Tuple2<String, Tuple2<String, String>>) codedescription[j];
|
||||||
if (t2i._1().compareTo(t2j._1()) > 0) {
|
if (t2i._1().compareTo(t2j._1()) > 0) {
|
||||||
Tuple2<String, String> temp = t2i;
|
Tuple2<String, Tuple2<String, String>> temp = t2i;
|
||||||
codedescription[i] = t2j;
|
codedescription[i] = t2j;
|
||||||
codedescription[j] = temp;
|
codedescription[j] = temp;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Map<String, String> map = new HashMap<>();
|
Map<String, Tuple2<String, String>> map = new HashMap<>();
|
||||||
for (int j = 0; j < codedescription.length; j++) {
|
for (int j = 0; j < codedescription.length; j++) {
|
||||||
Tuple2<String, String> entry = (Tuple2<String, String>) codedescription[j];
|
Tuple2<String, Tuple2<String, String>> entry = (Tuple2<String, Tuple2<String, String>>) codedescription[j];
|
||||||
String ent = entry._1();
|
String ent = entry._1();
|
||||||
if (ent.contains("Euratom-")) {
|
if (ent.contains("Euratom-")) {
|
||||||
ent = ent.replace("-Euratom-", ".Euratom.");
|
ent = ent.replace("-Euratom-", ".Euratom.");
|
||||||
}
|
}
|
||||||
String[] tmp = ent.split("\\.");
|
String[] tmp = ent.split("\\.");
|
||||||
if (tmp.length <= 2) {
|
if (tmp.length <= 2) {
|
||||||
|
if (StringUtils.isEmpty(entry._2()._2())) {
|
||||||
|
map.put(entry._1(), new Tuple2<String, String>(entry._2()._1(), entry._2()._1()));
|
||||||
|
} else {
|
||||||
map.put(entry._1(), entry._2());
|
map.put(entry._1(), entry._2());
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
if (ent.endsWith(".")) {
|
if (ent.endsWith(".")) {
|
||||||
ent = ent.substring(0, ent.length() - 1);
|
ent = ent.substring(0, ent.length() - 1);
|
||||||
|
@ -224,14 +237,14 @@ public class PrepareProgramme {
|
||||||
key = key.substring(0, key.length() - 1);
|
key = key.substring(0, key.length() - 1);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
String current = entry._2();
|
String current = entry._2()._1();
|
||||||
if (!ent.contains("Euratom")) {
|
if (!ent.contains("Euratom")) {
|
||||||
|
|
||||||
String parent;
|
String parent;
|
||||||
String tmp_key = tmp[0] + ".";
|
String tmp_key = tmp[0] + ".";
|
||||||
for (int i = 1; i < tmp.length - 1; i++) {
|
for (int i = 1; i < tmp.length - 1; i++) {
|
||||||
tmp_key += tmp[i] + ".";
|
tmp_key += tmp[i] + ".";
|
||||||
parent = map.get(tmp_key).toLowerCase().trim();
|
parent = map.get(tmp_key)._1().toLowerCase().trim();
|
||||||
if (parent.contains("|")) {
|
if (parent.contains("|")) {
|
||||||
parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
|
parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
|
||||||
}
|
}
|
||||||
|
@ -246,18 +259,29 @@ public class PrepareProgramme {
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
map.put(ent + ".", map.get(key) + " | " + current);
|
String shortTitle = entry._2()._2();
|
||||||
|
if (StringUtils.isEmpty(shortTitle)) {
|
||||||
|
shortTitle = current;
|
||||||
|
}
|
||||||
|
Tuple2<String, String> newEntry = new Tuple2<>(map.get(key)._1() + " | " + current,
|
||||||
|
map.get(key)._2() + " | " + shortTitle);
|
||||||
|
map.put(ent + ".", newEntry);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
h2020Programmes.foreach(csvProgramme -> {
|
return h2020Programmes.map(csvProgramme -> {
|
||||||
if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
|
|
||||||
&& !csvProgramme.getCode().equals("H2020-EC"))
|
String code = csvProgramme.getCode();
|
||||||
csvProgramme.setClassification(map.get(csvProgramme.getCode() + "."));
|
if (!code.endsWith(".") && !code.contains("Euratom")
|
||||||
else
|
&& !code.equals("H2020-EC"))
|
||||||
csvProgramme.setClassification(map.get(csvProgramme.getCode()));
|
code += ".";
|
||||||
});
|
|
||||||
|
csvProgramme.setClassification(map.get(code)._1());
|
||||||
|
csvProgramme.setClassification_short(map.get(code)._2());
|
||||||
|
|
||||||
|
return csvProgramme;
|
||||||
|
}).collect();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <R> Dataset<R> readPath(
|
public static <R> Dataset<R> readPath(
|
||||||
|
|
|
@ -9,7 +9,6 @@ import java.util.Objects;
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -138,7 +137,8 @@ public class SparkAtomicActionJob {
|
||||||
pm.setCode(csvProject.getProgramme());
|
pm.setCode(csvProject.getProgramme());
|
||||||
h2020classification.setClassification(ocsvProgramme.get().getClassification());
|
h2020classification.setClassification(ocsvProgramme.get().getClassification());
|
||||||
h2020classification.setH2020Programme(pm);
|
h2020classification.setH2020Programme(pm);
|
||||||
setLevelsAndProgramme(h2020classification, ocsvProgramme.get().getClassification());
|
setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short());
|
||||||
|
// setProgramme(h2020classification, ocsvProgramme.get().getClassification());
|
||||||
pp.setH2020classification(Arrays.asList(h2020classification));
|
pp.setH2020classification(Arrays.asList(h2020classification));
|
||||||
|
|
||||||
return pp;
|
return pp;
|
||||||
|
@ -177,8 +177,8 @@ public class SparkAtomicActionJob {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private static void setLevelsAndProgramme(H2020Classification h2020Classification, String classification) {
|
private static void setLevelsandProgramme(H2020Classification h2020Classification, String classification_short) {
|
||||||
String[] tmp = classification.split(" \\| ");
|
String[] tmp = classification_short.split(" \\| ");
|
||||||
h2020Classification.setLevel1(tmp[0]);
|
h2020Classification.setLevel1(tmp[0]);
|
||||||
if (tmp.length > 1) {
|
if (tmp.length > 1) {
|
||||||
h2020Classification.setLevel2(tmp[1]);
|
h2020Classification.setLevel2(tmp[1]);
|
||||||
|
@ -189,6 +189,12 @@ public class SparkAtomicActionJob {
|
||||||
h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
|
h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// private static void setProgramme(H2020Classification h2020Classification, String classification) {
|
||||||
|
// String[] tmp = classification.split(" \\| ");
|
||||||
|
//
|
||||||
|
// h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
|
||||||
|
// }
|
||||||
|
|
||||||
public static <R> Dataset<R> readPath(
|
public static <R> Dataset<R> readPath(
|
||||||
SparkSession spark, String inputPath, Class<R> clazz) {
|
SparkSession spark, String inputPath, Class<R> clazz) {
|
||||||
return spark
|
return spark
|
||||||
|
|
|
@ -22,6 +22,15 @@ public class CSVProgramme implements Serializable {
|
||||||
private String shortTitle;
|
private String shortTitle;
|
||||||
private String language;
|
private String language;
|
||||||
private String classification;
|
private String classification;
|
||||||
|
private String classification_short;
|
||||||
|
|
||||||
|
public String getClassification_short() {
|
||||||
|
return classification_short;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setClassification_short(String classification_short) {
|
||||||
|
this.classification_short = classification_short;
|
||||||
|
}
|
||||||
|
|
||||||
public String getClassification() {
|
public String getClassification() {
|
||||||
return classification;
|
return classification;
|
||||||
|
|
|
@ -9,12 +9,14 @@ import java.util.List;
|
||||||
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
|
||||||
import org.junit.jupiter.api.Assertions;
|
import org.junit.jupiter.api.Assertions;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
|
import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
|
||||||
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
|
||||||
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
|
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
|
||||||
|
|
||||||
|
@Disabled
|
||||||
public class EXCELParserTest {
|
public class EXCELParserTest {
|
||||||
|
|
||||||
private static Path workingDir;
|
private static Path workingDir;
|
||||||
|
|
|
@ -92,6 +92,8 @@ public class PrepareH2020ProgrammeTest {
|
||||||
|
|
||||||
Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count());
|
Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count());
|
||||||
|
|
||||||
|
// tmp.foreach(csvProgramme -> System.out.println(OBJECT_MAPPER.writeValueAsString(csvProgramme)));
|
||||||
|
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"Societal challenges | Smart, Green And Integrated Transport | CLEANSKY2 | IADP Fast Rotorcraft",
|
"Societal challenges | Smart, Green And Integrated Transport | CLEANSKY2 | IADP Fast Rotorcraft",
|
||||||
|
|
|
@ -78,7 +78,7 @@ public class SparkUpdateProjectTest {
|
||||||
"-programmePath",
|
"-programmePath",
|
||||||
getClass()
|
getClass()
|
||||||
.getResource(
|
.getResource(
|
||||||
"/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz")
|
"/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz")
|
||||||
.getPath(),
|
.getPath(),
|
||||||
"-projectPath",
|
"-projectPath",
|
||||||
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json").getPath(),
|
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json").getPath(),
|
||||||
|
@ -124,7 +124,7 @@ public class SparkUpdateProjectTest {
|
||||||
.getString(0));
|
.getString(0));
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"Societal challenges",
|
"Societal Challenges",
|
||||||
execverification
|
execverification
|
||||||
.filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'")
|
.filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'")
|
||||||
.select("classification.level1")
|
.select("classification.level1")
|
||||||
|
@ -133,7 +133,7 @@ public class SparkUpdateProjectTest {
|
||||||
.getString(0));
|
.getString(0));
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"Smart, Green And Integrated Transport",
|
"Transport",
|
||||||
execverification
|
execverification
|
||||||
.filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'")
|
.filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'")
|
||||||
.select("classification.level2")
|
.select("classification.level2")
|
||||||
|
@ -188,7 +188,7 @@ public class SparkUpdateProjectTest {
|
||||||
.getString(0));
|
.getString(0));
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"Nurturing excellence by means of cross-border and cross-sector mobility",
|
"MSCA Mobility",
|
||||||
execverification
|
execverification
|
||||||
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
|
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
|
||||||
.select("classification.h2020Programme.description")
|
.select("classification.h2020Programme.description")
|
||||||
|
@ -197,7 +197,7 @@ public class SparkUpdateProjectTest {
|
||||||
.getString(0));
|
.getString(0));
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"Excellent science",
|
"Excellent Science",
|
||||||
execverification
|
execverification
|
||||||
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
|
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
|
||||||
.select("classification.level1")
|
.select("classification.level1")
|
||||||
|
@ -206,7 +206,7 @@ public class SparkUpdateProjectTest {
|
||||||
.getString(0));
|
.getString(0));
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"Marie Skłodowska-Curie Actions",
|
"Marie-Sklodowska-Curie Actions",
|
||||||
execverification
|
execverification
|
||||||
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
|
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
|
||||||
.select("classification.level2")
|
.select("classification.level2")
|
||||||
|
@ -215,7 +215,7 @@ public class SparkUpdateProjectTest {
|
||||||
.getString(0));
|
.getString(0));
|
||||||
Assertions
|
Assertions
|
||||||
.assertEquals(
|
.assertEquals(
|
||||||
"Nurturing excellence by means of cross-border and cross-sector mobility",
|
"MSCA Mobility",
|
||||||
execverification
|
execverification
|
||||||
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
|
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
|
||||||
.select("classification.level3")
|
.select("classification.level3")
|
||||||
|
|
|
@ -6,8 +6,10 @@ import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
|
||||||
import org.apache.http.ssl.SSLContextBuilder;
|
import org.apache.http.ssl.SSLContextBuilder;
|
||||||
import org.junit.jupiter.api.BeforeAll;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
|
import org.junit.jupiter.api.Disabled;
|
||||||
import org.junit.jupiter.api.Test;
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
@Disabled
|
||||||
public class HttpConnectorTest {
|
public class HttpConnectorTest {
|
||||||
|
|
||||||
private static final Log log = LogFactory.getLog(HttpConnectorTest.class);
|
private static final Log log = LogFactory.getLog(HttpConnectorTest.class);
|
||||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -82,7 +82,7 @@ public class DedupRecordFactory {
|
||||||
|
|
||||||
final Collection<String> dates = Lists.newArrayList();
|
final Collection<String> dates = Lists.newArrayList();
|
||||||
final List<List<Author>> authors = Lists.newArrayList();
|
final List<List<Author>> authors = Lists.newArrayList();
|
||||||
final List<Identifier> bestPids = Lists.newArrayList(); // best pids list
|
final List<Identifier<T>> bestPids = Lists.newArrayList(); // best pids list
|
||||||
|
|
||||||
entities
|
entities
|
||||||
.forEachRemaining(
|
.forEachRemaining(
|
||||||
|
@ -90,7 +90,7 @@ public class DedupRecordFactory {
|
||||||
T duplicate = t._2();
|
T duplicate = t._2();
|
||||||
|
|
||||||
// prepare the list of pids to use for the id generation
|
// prepare the list of pids to use for the id generation
|
||||||
bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate));
|
bestPids.add(Identifier.newInstance(duplicate));
|
||||||
|
|
||||||
entity.mergeFrom(duplicate);
|
entity.mergeFrom(duplicate);
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||||
|
|
|
@ -1,124 +1,46 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
|
import static org.apache.commons.lang3.StringUtils.substringAfter;
|
||||||
|
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.text.ParseException;
|
import java.util.List;
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.*;
|
|
||||||
|
|
||||||
import org.apache.commons.lang.StringUtils;
|
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.PidType;
|
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
|
||||||
|
|
||||||
public class IdGenerator implements Serializable {
|
public class IdGenerator implements Serializable {
|
||||||
|
|
||||||
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
|
||||||
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
|
||||||
public static String BASE_DATE = "2000-01-01";
|
|
||||||
|
|
||||||
// pick the best pid from the list (consider date and pidtype)
|
// pick the best pid from the list (consider date and pidtype)
|
||||||
public static String generate(List<Identifier> pids, String defaultID) {
|
public static <T extends OafEntity> String generate(List<Identifier<T>> pids, String defaultID) {
|
||||||
if (pids == null || pids.size() == 0)
|
if (pids == null || pids.size() == 0)
|
||||||
return defaultID;
|
return defaultID;
|
||||||
|
|
||||||
Optional<Identifier> bp = pids
|
Identifier<T> bp = pids
|
||||||
.stream()
|
.stream()
|
||||||
.max(Identifier::compareTo);
|
.min(Identifier::compareTo)
|
||||||
|
.get();
|
||||||
|
|
||||||
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
|
String prefix = substringBefore(bp.getOriginalID(), "|");
|
||||||
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
|
String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::");
|
||||||
+ DHPUtils.md5(bp.get().getOriginalID());
|
String suffix = substringAfter(bp.getOriginalID(), "::");
|
||||||
|
|
||||||
|
final String pidType = substringBefore(ns, "_");
|
||||||
|
if (PidType.isValid(pidType)) {
|
||||||
|
return prefix + "|" + dedupify(ns) + "::" + suffix;
|
||||||
} else {
|
} else {
|
||||||
return bp.get().getOriginalID().split("\\|")[0] + "|"
|
return prefix + "|dedup_wf_001::" + suffix;
|
||||||
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
|
}
|
||||||
+ DHPUtils.md5(bp.get().getPid().getValue());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
private static String dedupify(String ns) {
|
||||||
|
StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
|
||||||
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
|
|
||||||
|
|
||||||
Date date;
|
|
||||||
try {
|
|
||||||
date = sdf.parse(BASE_DATE);
|
|
||||||
} catch (ParseException e) {
|
|
||||||
date = new Date();
|
|
||||||
}
|
|
||||||
return Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
|
|
||||||
EntityType.fromClass(entity.getClass()), entity.getId()));
|
|
||||||
}
|
|
||||||
|
|
||||||
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
|
||||||
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
|
|
||||||
|
|
||||||
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
|
||||||
|
|
||||||
if (entity.getPid() == null || entity.getPid().size() == 0)
|
|
||||||
return createBasePid(entity, sdf);
|
|
||||||
|
|
||||||
Optional<StructuredProperty> bp = entity
|
|
||||||
.getPid()
|
|
||||||
.stream()
|
|
||||||
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
|
|
||||||
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
|
|
||||||
|
|
||||||
return bp
|
|
||||||
.map(
|
|
||||||
structuredProperty -> Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(structuredProperty, extractDate(entity, sdf),
|
|
||||||
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
|
|
||||||
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
|
|
||||||
.orElseGet(() -> createBasePid(entity, sdf));
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// create the prefix (length = 12): dedup_+ pidType
|
|
||||||
public static String createPrefix(String pidType) {
|
|
||||||
|
|
||||||
StringBuilder prefix = new StringBuilder("dedup_" + pidType);
|
|
||||||
|
|
||||||
while (prefix.length() < 12) {
|
while (prefix.length() < 12) {
|
||||||
prefix.append("_");
|
prefix.append("_");
|
||||||
}
|
}
|
||||||
return prefix.toString().substring(0, 12);
|
return prefix.substring(0, 12);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
|
|
||||||
// 00-01-01
|
|
||||||
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
|
|
||||||
|
|
||||||
String date = BASE_DATE;
|
|
||||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
|
||||||
Result result = (Result) duplicate;
|
|
||||||
if (isWellformed(result.getDateofacceptance())) {
|
|
||||||
date = result.getDateofacceptance().getValue();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
|
||||||
return sdf.parse(date);
|
|
||||||
} catch (ParseException e) {
|
|
||||||
return new Date();
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
public static boolean isWellformed(Field<String> date) {
|
|
||||||
return date != null && StringUtils.isNotBlank(date.getValue())
|
|
||||||
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -2,93 +2,85 @@
|
||||||
package eu.dnetlib.dhp.oa.dedup.model;
|
package eu.dnetlib.dhp.oa.dedup.model;
|
||||||
|
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
|
import java.text.ParseException;
|
||||||
|
import java.text.SimpleDateFormat;
|
||||||
import java.util.Date;
|
import java.util.Date;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.Optional;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Sets;
|
import com.google.common.collect.Sets;
|
||||||
import eu.dnetlib.dhp.oa.dedup.IdGenerator;
|
|
||||||
|
import eu.dnetlib.dhp.oa.dedup.DatePicker;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||||
|
|
||||||
public class Identifier implements Serializable, Comparable<Identifier> {
|
public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier> {
|
||||||
|
|
||||||
StructuredProperty pid;
|
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
||||||
Date date;
|
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
||||||
PidType type;
|
public static String BASE_DATE = "2000-01-01";
|
||||||
List<KeyValue> collectedFrom;
|
|
||||||
EntityType entityType;
|
|
||||||
String originalID;
|
|
||||||
|
|
||||||
boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original
|
private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
||||||
// ID
|
|
||||||
|
|
||||||
public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom,
|
private T entity;
|
||||||
EntityType entityType, String originalID) {
|
|
||||||
this.pid = pid;
|
public static <T extends OafEntity> Identifier newInstance(T entity) {
|
||||||
this.date = date;
|
return new Identifier(entity);
|
||||||
this.type = type;
|
|
||||||
this.collectedFrom = collectedFrom;
|
|
||||||
this.entityType = entityType;
|
|
||||||
this.originalID = originalID;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public StructuredProperty getPid() {
|
public Identifier(T entity) {
|
||||||
return pid;
|
this.entity = entity;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setPid(StructuredProperty pid) {
|
public T getEntity() {
|
||||||
this.pid = pid;
|
return entity;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setEntity(T entity) {
|
||||||
|
this.entity = entity;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Date getDate() {
|
public Date getDate() {
|
||||||
return date;
|
String date = BASE_DATE;
|
||||||
|
if (ModelSupport.isSubClass(getEntity(), Result.class)) {
|
||||||
|
Result result = (Result) getEntity();
|
||||||
|
if (isWellformed(result.getDateofacceptance())) {
|
||||||
|
date = result.getDateofacceptance().getValue();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
try {
|
||||||
|
return sdf.parse(date);
|
||||||
|
} catch (ParseException e) {
|
||||||
|
return new Date();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setDate(Date date) {
|
private static boolean isWellformed(Field<String> date) {
|
||||||
this.date = date;
|
return date != null && StringUtils.isNotBlank(date.getValue())
|
||||||
}
|
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
|
||||||
|
|
||||||
public PidType getType() {
|
|
||||||
return type;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setType(PidType type) {
|
|
||||||
this.type = type;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<KeyValue> getCollectedFrom() {
|
public List<KeyValue> getCollectedFrom() {
|
||||||
return collectedFrom;
|
return entity.getCollectedfrom();
|
||||||
}
|
|
||||||
|
|
||||||
public void setCollectedFrom(List<KeyValue> collectedFrom) {
|
|
||||||
this.collectedFrom = collectedFrom;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public EntityType getEntityType() {
|
public EntityType getEntityType() {
|
||||||
return entityType;
|
return EntityType.fromClass(entity.getClass());
|
||||||
}
|
|
||||||
|
|
||||||
public void setEntityType(EntityType entityType) {
|
|
||||||
this.entityType = entityType;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getOriginalID() {
|
public String getOriginalID() {
|
||||||
return originalID;
|
return entity.getId();
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setOriginalID(String originalID) {
|
private PidType getPidType() {
|
||||||
this.originalID = originalID;
|
return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isUseOriginal() {
|
|
||||||
return useOriginal;
|
|
||||||
}
|
|
||||||
|
|
||||||
public void setUseOriginal(boolean useOriginal) {
|
|
||||||
this.useOriginal = useOriginal;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -96,50 +88,50 @@ public class Identifier implements Serializable, Comparable<Identifier> {
|
||||||
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
|
||||||
// alphabetical order of the originalID
|
// alphabetical order of the originalID
|
||||||
|
|
||||||
Set<String> lKeys = Sets.newHashSet();
|
Set<String> lKeys = Optional
|
||||||
if (this.collectedFrom != null)
|
.ofNullable(getCollectedFrom())
|
||||||
lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet());
|
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
||||||
|
.orElse(Sets.newHashSet());
|
||||||
|
|
||||||
Set<String> rKeys = Sets.newHashSet();
|
final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
|
||||||
if (i.getCollectedFrom() != null)
|
Set<String> rKeys = cf
|
||||||
rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet());
|
.map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
|
||||||
|
.orElse(Sets.newHashSet());
|
||||||
|
|
||||||
if (this.getType().compareTo(i.getType()) == 0) { // same type
|
if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
|
||||||
if (entityType == EntityType.publication) {
|
if (getEntityType() == EntityType.publication) {
|
||||||
if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID)
|
if (isFromDatasourceID(lKeys, CROSSREF_ID)
|
||||||
&& !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID))
|
&& !isFromDatasourceID(rKeys, CROSSREF_ID))
|
||||||
return 1;
|
|
||||||
if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
|
|
||||||
&& !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
|
|
||||||
return -1;
|
return -1;
|
||||||
|
if (isFromDatasourceID(rKeys, CROSSREF_ID)
|
||||||
|
&& !isFromDatasourceID(lKeys, CROSSREF_ID))
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
if (entityType == EntityType.dataset) {
|
if (getEntityType() == EntityType.dataset) {
|
||||||
if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID)
|
if (isFromDatasourceID(lKeys, DATACITE_ID)
|
||||||
&& !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID))
|
&& !isFromDatasourceID(rKeys, DATACITE_ID))
|
||||||
return 1;
|
|
||||||
if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
|
|
||||||
&& !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
|
|
||||||
return -1;
|
return -1;
|
||||||
|
if (isFromDatasourceID(rKeys, DATACITE_ID)
|
||||||
|
&& !isFromDatasourceID(lKeys, DATACITE_ID))
|
||||||
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
if (this.getDate().compareTo(i.getDate()) == 0) {// same date
|
||||||
|
|
||||||
if (this.originalID.compareTo(i.originalID) < 0)
|
|
||||||
this.useOriginal = true;
|
|
||||||
else
|
|
||||||
i.setUseOriginal(true);
|
|
||||||
|
|
||||||
// the minus because we need to take the alphabetically lower id
|
// the minus because we need to take the alphabetically lower id
|
||||||
return -this.originalID.compareTo(i.originalID);
|
return this.getOriginalID().compareTo(i.getOriginalID());
|
||||||
} else
|
} else
|
||||||
// the minus is because we need to take the elder date
|
// the minus is because we need to take the elder date
|
||||||
return -this.getDate().compareTo(i.getDate());
|
return this.getDate().compareTo(i.getDate());
|
||||||
} else {
|
} else {
|
||||||
return this.getType().compareTo(i.getType());
|
return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private StructuredProperty toSP(PidType pidType) {
|
||||||
|
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
|
||||||
|
}
|
||||||
|
|
||||||
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
|
||||||
return collectedFrom.contains(dsId);
|
return collectedFrom.contains(dsId);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,17 +0,0 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup.model;
|
|
||||||
|
|
||||||
public enum PidType {
|
|
||||||
|
|
||||||
// from the less to the more important
|
|
||||||
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
|
|
||||||
|
|
||||||
public static PidType classidValueOf(String s) {
|
|
||||||
try {
|
|
||||||
return PidType.valueOf(s);
|
|
||||||
} catch (Exception e) {
|
|
||||||
return PidType.undefined;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
|
@ -9,6 +9,7 @@ import java.io.IOException;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
import org.codehaus.jackson.map.ObjectMapper;
|
||||||
import org.junit.jupiter.api.BeforeEach;
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
@ -21,16 +22,16 @@ import scala.Tuple2;
|
||||||
|
|
||||||
public class EntityMergerTest implements Serializable {
|
public class EntityMergerTest implements Serializable {
|
||||||
|
|
||||||
List<Tuple2<String, Publication>> publications;
|
private List<Tuple2<String, Publication>> publications;
|
||||||
List<Tuple2<String, Publication>> publications2;
|
private List<Tuple2<String, Publication>> publications2;
|
||||||
List<Tuple2<String, Publication>> publications3;
|
private List<Tuple2<String, Publication>> publications3;
|
||||||
List<Tuple2<String, Publication>> publications4;
|
private List<Tuple2<String, Publication>> publications4;
|
||||||
List<Tuple2<String, Publication>> publications5;
|
private List<Tuple2<String, Publication>> publications5;
|
||||||
|
|
||||||
String testEntityBasePath;
|
private String testEntityBasePath;
|
||||||
DataInfo dataInfo;
|
private DataInfo dataInfo;
|
||||||
String dedupId = "00|dedup_id::1";
|
private String dedupId = "00|dedup_id::1";
|
||||||
Publication pub_top;
|
private Publication pub_top;
|
||||||
|
|
||||||
@BeforeEach
|
@BeforeEach
|
||||||
public void setUp() throws Exception {
|
public void setUp() throws Exception {
|
||||||
|
@ -61,9 +62,9 @@ public class EntityMergerTest implements Serializable {
|
||||||
Software merged = DedupRecordFactory
|
Software merged = DedupRecordFactory
|
||||||
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
|
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
|
||||||
|
|
||||||
assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE");
|
assertEquals("OPEN SOURCE", merged.getBestaccessright().getClassid());
|
||||||
|
|
||||||
assertEquals(merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
|
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", merged.getId());
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -74,45 +75,45 @@ public class EntityMergerTest implements Serializable {
|
||||||
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals(pub_merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340");
|
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", pub_merged.getId());
|
||||||
|
|
||||||
assertEquals(pub_merged.getJournal(), pub_top.getJournal());
|
assertEquals(pub_top.getJournal(), pub_merged.getJournal());
|
||||||
assertEquals(pub_merged.getBestaccessright().getClassid(), "OPEN");
|
assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
|
||||||
assertEquals(pub_merged.getResulttype(), pub_top.getResulttype());
|
assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
|
||||||
assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage());
|
assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
|
||||||
assertEquals(pub_merged.getPublisher(), pub_top.getPublisher());
|
assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
|
||||||
assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate());
|
assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
|
||||||
assertEquals(pub_merged.getResourcetype().getClassid(), "0004");
|
assertEquals(pub_top.getResourcetype().getClassid(), "");
|
||||||
assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation());
|
assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
|
||||||
assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance());
|
assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
|
||||||
assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection());
|
assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
|
||||||
assertEquals(pub_merged.getInstance().size(), 3);
|
assertEquals(3, pub_merged.getInstance().size());
|
||||||
assertEquals(pub_merged.getCountry().size(), 2);
|
assertEquals(2, pub_merged.getCountry().size());
|
||||||
assertEquals(pub_merged.getSubject().size(), 0);
|
assertEquals(0, pub_merged.getSubject().size());
|
||||||
assertEquals(pub_merged.getTitle().size(), 2);
|
assertEquals(2, pub_merged.getTitle().size());
|
||||||
assertEquals(pub_merged.getRelevantdate().size(), 0);
|
assertEquals(0, pub_merged.getRelevantdate().size());
|
||||||
assertEquals(pub_merged.getDescription().size(), 0);
|
assertEquals(0, pub_merged.getDescription().size());
|
||||||
assertEquals(pub_merged.getSource().size(), 0);
|
assertEquals(0, pub_merged.getSource().size());
|
||||||
assertEquals(pub_merged.getFulltext().size(), 0);
|
assertEquals(0, pub_merged.getFulltext().size());
|
||||||
assertEquals(pub_merged.getFormat().size(), 0);
|
assertEquals(0, pub_merged.getFormat().size());
|
||||||
assertEquals(pub_merged.getContributor().size(), 0);
|
assertEquals(0, pub_merged.getContributor().size());
|
||||||
assertEquals(pub_merged.getCoverage().size(), 0);
|
assertEquals(0, pub_merged.getCoverage().size());
|
||||||
assertEquals(pub_merged.getContext().size(), 0);
|
assertEquals(0, pub_merged.getContext().size());
|
||||||
assertEquals(pub_merged.getExternalReference().size(), 0);
|
assertEquals(0, pub_merged.getExternalReference().size());
|
||||||
assertEquals(pub_merged.getOriginalId().size(), 3);
|
assertEquals(3, pub_merged.getOriginalId().size());
|
||||||
assertEquals(pub_merged.getCollectedfrom().size(), 3);
|
assertEquals(3, pub_merged.getCollectedfrom().size());
|
||||||
assertEquals(pub_merged.getPid().size(), 1);
|
assertEquals(1, pub_merged.getPid().size());
|
||||||
assertEquals(pub_merged.getExtraInfo().size(), 0);
|
assertEquals(0, pub_merged.getExtraInfo().size());
|
||||||
|
|
||||||
// verify datainfo
|
// verify datainfo
|
||||||
assertEquals(pub_merged.getDataInfo(), dataInfo);
|
assertEquals(dataInfo, pub_merged.getDataInfo());
|
||||||
|
|
||||||
// verify datepicker
|
// verify datepicker
|
||||||
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30");
|
assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
|
||||||
|
|
||||||
// verify authors
|
// verify authors
|
||||||
assertEquals(pub_merged.getAuthor().size(), 9);
|
assertEquals(9, pub_merged.getAuthor().size());
|
||||||
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4);
|
assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
|
||||||
|
|
||||||
// verify title
|
// verify title
|
||||||
int count = 0;
|
int count = 0;
|
||||||
|
@ -120,7 +121,7 @@ public class EntityMergerTest implements Serializable {
|
||||||
if (title.getQualifier().getClassid().equals("main title"))
|
if (title.getQualifier().getClassid().equals("main title"))
|
||||||
count++;
|
count++;
|
||||||
}
|
}
|
||||||
assertEquals(count, 1);
|
assertEquals(1, count);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -130,9 +131,9 @@ public class EntityMergerTest implements Serializable {
|
||||||
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
|
|
||||||
assertEquals(pub_merged.getAuthor().size(), 27);
|
assertEquals(27, pub_merged.getAuthor().size());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -142,7 +143,7 @@ public class EntityMergerTest implements Serializable {
|
||||||
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -152,17 +153,24 @@ public class EntityMergerTest implements Serializable {
|
||||||
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId());
|
assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
|
||||||
|
|
||||||
|
System.out
|
||||||
|
.println(
|
||||||
|
publications5
|
||||||
|
.stream()
|
||||||
|
.map(p -> p._2().getId())
|
||||||
|
.collect(Collectors.toList()));
|
||||||
|
|
||||||
Publication pub_merged = DedupRecordFactory
|
Publication pub_merged = DedupRecordFactory
|
||||||
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
|
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
|
||||||
|
|
||||||
// verify id
|
// verify id
|
||||||
assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId());
|
assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
|
||||||
}
|
}
|
||||||
|
|
||||||
public DataInfo setDI() {
|
public DataInfo setDI() {
|
||||||
|
|
|
@ -7,96 +7,57 @@ import java.io.BufferedReader;
|
||||||
import java.io.FileReader;
|
import java.io.FileReader;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
import java.text.SimpleDateFormat;
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Date;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.codehaus.jackson.map.ObjectMapper;
|
import org.junit.jupiter.api.BeforeAll;
|
||||||
import org.junit.jupiter.api.*;
|
import org.junit.jupiter.api.MethodOrderer;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.TestMethodOrder;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import com.google.common.collect.Lists;
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||||
import eu.dnetlib.dhp.oa.dedup.model.PidType;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
public class IdGeneratorTest {
|
public class IdGeneratorTest {
|
||||||
|
|
||||||
private static List<Identifier> bestIds;
|
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||||
private static List<Tuple2<String, Publication>> pubs;
|
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||||
|
|
||||||
private static List<Identifier> bestIds2;
|
private static List<Identifier<Publication>> bestIds;
|
||||||
private static List<Identifier> bestIds3;
|
private static List<Identifier<Publication>> bestIds2;
|
||||||
|
private static List<Identifier<Publication>> bestIds3;
|
||||||
|
|
||||||
private static String testEntityBasePath;
|
private static String testEntityBasePath;
|
||||||
|
|
||||||
private static SimpleDateFormat sdf;
|
|
||||||
private static Date baseDate;
|
|
||||||
|
|
||||||
@BeforeAll
|
@BeforeAll
|
||||||
public static void setUp() throws Exception {
|
public static void setUp() throws Exception {
|
||||||
|
|
||||||
sdf = new SimpleDateFormat("yyyy-MM-dd");
|
|
||||||
baseDate = sdf.parse("2000-01-01");
|
|
||||||
|
|
||||||
bestIds = new ArrayList<>();
|
|
||||||
bestIds2 = Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
|
|
||||||
new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID2"),
|
|
||||||
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
|
|
||||||
bestIds3 = Lists
|
|
||||||
.newArrayList(
|
|
||||||
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
|
|
||||||
new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi, keyValue("key", "value"),
|
|
||||||
EntityType.publication, "50|originalID2"),
|
|
||||||
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
|
|
||||||
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
|
|
||||||
|
|
||||||
testEntityBasePath = Paths
|
testEntityBasePath = Paths
|
||||||
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
|
||||||
.toFile()
|
.toFile()
|
||||||
.getAbsolutePath();
|
.getAbsolutePath();
|
||||||
|
|
||||||
pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
|
bestIds = createBestIds(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
|
||||||
|
bestIds2 = createBestIds(testEntityBasePath + "/publication_idgeneration2.json", Publication.class);
|
||||||
|
bestIds3 = createBestIds(testEntityBasePath + "/publication_idgeneration3.json", Publication.class);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@Order(1)
|
|
||||||
public void bestPidToIdentifierTest() {
|
|
||||||
|
|
||||||
List<String> typesForAssertions = Lists
|
|
||||||
.newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());
|
|
||||||
|
|
||||||
for (Tuple2<String, Publication> pub : pubs) {
|
|
||||||
List<Identifier> ids = IdGenerator.bestPidToIdentifier(pub._2());
|
|
||||||
assertEquals(typesForAssertions.get(pubs.indexOf(pub)), ids.get(0).getPid().getQualifier().getClassid());
|
|
||||||
bestIds.addAll(ids);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
@Test
|
|
||||||
@Order(2)
|
|
||||||
public void generateIdTest1() {
|
public void generateIdTest1() {
|
||||||
String id1 = IdGenerator.generate(bestIds, "50|defaultID");
|
String id1 = IdGenerator.generate(bestIds, "50|defaultID");
|
||||||
|
|
||||||
System.out.println("id list 1 = " + bestIds.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
|
System.out
|
||||||
|
.println("id list 1 = " + bestIds.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
|
||||||
|
|
||||||
assertEquals("50|dedup_wf_001::9c5cfbf993d38476e0f959a301239719", id1);
|
assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", id1);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
@ -104,13 +65,23 @@ public class IdGeneratorTest {
|
||||||
String id1 = IdGenerator.generate(bestIds2, "50|defaultID");
|
String id1 = IdGenerator.generate(bestIds2, "50|defaultID");
|
||||||
String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
|
String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
|
||||||
|
|
||||||
System.out.println("id list 2 = " + bestIds2.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
|
System.out
|
||||||
|
.println("id list 2 = " + bestIds2.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
|
||||||
System.out.println("winner 2 = " + id1);
|
System.out.println("winner 2 = " + id1);
|
||||||
System.out.println("id list 3 = " + bestIds3.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList()));
|
System.out
|
||||||
|
.println("id list 3 = " + bestIds3.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
|
||||||
System.out.println("winner 3 = " + id2);
|
System.out.println("winner 3 = " + id2);
|
||||||
|
|
||||||
assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1);
|
assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
|
||||||
assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2);
|
assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static <T extends OafEntity> List<Identifier<T>> createBestIds(String path, Class<T> clazz) {
|
||||||
|
final Stream<Identifier<T>> ids = readSample(path, clazz)
|
||||||
|
.stream()
|
||||||
|
.map(Tuple2::_2)
|
||||||
|
.map(Identifier::newInstance);
|
||||||
|
return ids.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
|
||||||
|
@ -124,7 +95,7 @@ public class IdGeneratorTest {
|
||||||
.add(
|
.add(
|
||||||
new Tuple2<>(
|
new Tuple2<>(
|
||||||
MapDocumentUtil.getJPathString("$.id", line),
|
MapDocumentUtil.getJPathString("$.id", line),
|
||||||
new ObjectMapper().readValue(line, clazz)));
|
OBJECT_MAPPER.readValue(line, clazz)));
|
||||||
// read next line
|
// read next line
|
||||||
line = reader.readLine();
|
line = reader.readLine();
|
||||||
}
|
}
|
||||||
|
@ -137,23 +108,10 @@ public class IdGeneratorTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
public static StructuredProperty pid(String pid, String classid, String classname) {
|
public static StructuredProperty pid(String pid, String classid, String classname) {
|
||||||
|
return OafMapperUtils.structuredProperty(pid, classid, classname, "", "", new DataInfo());
|
||||||
StructuredProperty sp = new StructuredProperty();
|
|
||||||
sp.setValue(pid);
|
|
||||||
Qualifier q = new Qualifier();
|
|
||||||
q.setSchemeid(classid);
|
|
||||||
q.setSchemename(classname);
|
|
||||||
q.setClassname(classname);
|
|
||||||
q.setClassid(classid);
|
|
||||||
sp.setQualifier(q);
|
|
||||||
return sp;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static List<KeyValue> keyValue(String key, String value) {
|
public static List<KeyValue> keyValue(String key, String value) {
|
||||||
|
return Lists.newArrayList(OafMapperUtils.keyValue(key, value));
|
||||||
KeyValue kv = new KeyValue();
|
|
||||||
kv.setKey(key);
|
|
||||||
kv.setValue(value);
|
|
||||||
return Lists.newArrayList(kv);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,12 +1,18 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.dedup;
|
package eu.dnetlib.dhp.oa.dedup;
|
||||||
|
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import static java.nio.file.Files.createTempDirectory;
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import static org.apache.spark.sql.functions.count;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import static org.mockito.Mockito.lenient;
|
||||||
import eu.dnetlib.pace.util.MapDocumentUtil;
|
|
||||||
|
import java.io.File;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.net.URISyntaxException;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
|
@ -25,19 +31,16 @@ import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
import org.mockito.Mock;
|
import org.mockito.Mock;
|
||||||
import org.mockito.Mockito;
|
import org.mockito.Mockito;
|
||||||
import org.mockito.junit.jupiter.MockitoExtension;
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
import eu.dnetlib.pace.util.MapDocumentUtil;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
|
||||||
import java.io.File;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.io.Serializable;
|
|
||||||
import java.net.URISyntaxException;
|
|
||||||
import java.nio.file.Paths;
|
|
||||||
|
|
||||||
import static java.nio.file.Files.createTempDirectory;
|
|
||||||
import static org.apache.spark.sql.functions.count;
|
|
||||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
|
||||||
import static org.mockito.Mockito.lenient;
|
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||||
public class SparkDedupTest implements Serializable {
|
public class SparkDedupTest implements Serializable {
|
||||||
|
|
File diff suppressed because one or more lines are too long
|
@ -0,0 +1,3 @@
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||||
|
{ "id" : "50|doi_________::1a77a3bba737f8b669dcf330ad3b37e2", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "doi" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
|
@ -0,0 +1,3 @@
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||||
|
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1i", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -1,13 +1,15 @@
|
||||||
package eu.dnetlib.doiboost
|
package eu.dnetlib.doiboost
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset, Organization}
|
import eu.dnetlib.dhp.oa.merge.AuthorMerger
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
|
||||||
import eu.dnetlib.doiboost.mag.ConversionUtil
|
import eu.dnetlib.doiboost.mag.ConversionUtil
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.SparkConf
|
import org.apache.spark.SparkConf
|
||||||
import org.apache.spark.sql.functions.col
|
import org.apache.spark.sql.functions.col
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.{Logger, LoggerFactory}
|
import org.slf4j.{Logger, LoggerFactory}
|
||||||
|
|
||||||
import scala.collection.JavaConverters._
|
import scala.collection.JavaConverters._
|
||||||
|
|
||||||
object SparkGenerateDoiBoost {
|
object SparkGenerateDoiBoost {
|
||||||
|
@ -49,6 +51,7 @@ object SparkGenerateDoiBoost {
|
||||||
val otherPub = item._2._2
|
val otherPub = item._2._2
|
||||||
if (otherPub != null) {
|
if (otherPub != null) {
|
||||||
crossrefPub.mergeFrom(otherPub)
|
crossrefPub.mergeFrom(otherPub)
|
||||||
|
crossrefPub.setAuthor(AuthorMerger.mergeAuthor(crossrefPub.getAuthor, otherPub.getAuthor))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
crossrefPub
|
crossrefPub
|
||||||
|
|
|
@ -38,6 +38,8 @@ class QueryTest {
|
||||||
def myQuery(spark:SparkSession, sc:SparkContext): Unit = {
|
def myQuery(spark:SparkSession, sc:SparkContext): Unit = {
|
||||||
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)
|
||||||
|
|
||||||
|
|
|
@ -3,13 +3,9 @@ package eu.dnetlib.dhp.oa.graph.clean;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.BufferedInputStream;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
import java.util.Optional;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.spark.SparkConf;
|
import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.function.MapFunction;
|
import org.apache.spark.api.java.function.MapFunction;
|
||||||
import org.apache.spark.sql.Dataset;
|
import org.apache.spark.sql.Dataset;
|
||||||
|
@ -23,10 +19,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
|
@ -1,8 +1,8 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
|
||||||
|
|
||||||
import java.util.*;
|
import java.util.*;
|
||||||
|
|
||||||
|
@ -12,10 +12,13 @@ import org.dom4j.DocumentFactory;
|
||||||
import org.dom4j.DocumentHelper;
|
import org.dom4j.DocumentHelper;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
|
import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.LicenseComparator;
|
import eu.dnetlib.dhp.schema.common.LicenseComparator;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
|
|
||||||
public abstract class AbstractMdRecordToOafMapper {
|
public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
|
@ -133,20 +136,34 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final DataInfo info,
|
final DataInfo info,
|
||||||
final long lastUpdateTimestamp) {
|
final long lastUpdateTimestamp) {
|
||||||
|
|
||||||
final List<Oaf> oafs = new ArrayList<>();
|
final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||||
|
final String id = IdentifierFactory.createIdentifier(entity);
|
||||||
|
if (!id.equals(entity.getId())) {
|
||||||
|
entity.getOriginalId().add(entity.getId());
|
||||||
|
entity.setId(id);
|
||||||
|
}
|
||||||
|
|
||||||
|
final List<Oaf> oafs = Lists.newArrayList(entity);
|
||||||
|
|
||||||
|
if (!oafs.isEmpty()) {
|
||||||
|
oafs.addAll(addProjectRels(doc, entity));
|
||||||
|
oafs.addAll(addOtherResultRels(doc, entity));
|
||||||
|
}
|
||||||
|
|
||||||
|
return oafs;
|
||||||
|
}
|
||||||
|
|
||||||
|
private OafEntity createEntity(Document doc, String type, List<Instance> instances, KeyValue collectedFrom,
|
||||||
|
DataInfo info, long lastUpdateTimestamp) {
|
||||||
switch (type.toLowerCase()) {
|
switch (type.toLowerCase()) {
|
||||||
case "publication":
|
case "publication":
|
||||||
final Publication p = new Publication();
|
final Publication p = new Publication();
|
||||||
populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||||
p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
|
|
||||||
p.setJournal(prepareJournal(doc, info));
|
p.setJournal(prepareJournal(doc, info));
|
||||||
oafs.add(p);
|
return p;
|
||||||
break;
|
|
||||||
case "dataset":
|
case "dataset":
|
||||||
final Dataset d = new Dataset();
|
final Dataset d = new Dataset();
|
||||||
populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||||
d.setResulttype(DATASET_DEFAULT_RESULTTYPE);
|
|
||||||
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
d.setStoragedate(prepareDatasetStorageDate(doc, info));
|
||||||
d.setDevice(prepareDatasetDevice(doc, info));
|
d.setDevice(prepareDatasetDevice(doc, info));
|
||||||
d.setSize(prepareDatasetSize(doc, info));
|
d.setSize(prepareDatasetSize(doc, info));
|
||||||
|
@ -154,48 +171,34 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
|
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
|
||||||
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
|
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
|
||||||
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
|
d.setGeolocation(prepareDatasetGeoLocations(doc, info));
|
||||||
oafs.add(d);
|
return d;
|
||||||
break;
|
|
||||||
case "software":
|
case "software":
|
||||||
final Software s = new Software();
|
final Software s = new Software();
|
||||||
populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||||
s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
|
|
||||||
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
|
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
|
||||||
s.setLicense(prepareSoftwareLicenses(doc, info));
|
s.setLicense(prepareSoftwareLicenses(doc, info));
|
||||||
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
|
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
|
||||||
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
|
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
|
||||||
oafs.add(s);
|
return s;
|
||||||
break;
|
|
||||||
case "":
|
case "":
|
||||||
case "otherresearchproducts":
|
case "otherresearchproducts":
|
||||||
default:
|
default:
|
||||||
final OtherResearchProduct o = new OtherResearchProduct();
|
final OtherResearchProduct o = new OtherResearchProduct();
|
||||||
populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
|
||||||
o.setResulttype(ORP_DEFAULT_RESULTTYPE);
|
|
||||||
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
|
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
|
||||||
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
|
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
|
||||||
o.setTool(prepareOtherResearchProductTools(doc, info));
|
o.setTool(prepareOtherResearchProductTools(doc, info));
|
||||||
oafs.add(o);
|
return o;
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!oafs.isEmpty()) {
|
|
||||||
oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
|
|
||||||
oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
|
|
||||||
}
|
|
||||||
|
|
||||||
return oafs;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private List<Oaf> addProjectRels(
|
private List<Oaf> addProjectRels(
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final KeyValue collectedFrom,
|
final OafEntity entity) {
|
||||||
final DataInfo info,
|
|
||||||
final long lastUpdateTimestamp) {
|
|
||||||
|
|
||||||
final List<Oaf> res = new ArrayList<>();
|
final List<Oaf> res = new ArrayList<>();
|
||||||
|
|
||||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
final String docId = entity.getId();
|
||||||
|
|
||||||
for (final Object o : doc.selectNodes("//oaf:projectid")) {
|
for (final Object o : doc.selectNodes("//oaf:projectid")) {
|
||||||
|
|
||||||
|
@ -207,13 +210,11 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
getRelation(
|
getRelation(
|
||||||
docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info,
|
docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity));
|
||||||
lastUpdateTimestamp));
|
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
getRelation(
|
getRelation(
|
||||||
projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info,
|
projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity));
|
||||||
lastUpdateTimestamp));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -225,26 +226,22 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
final String relType,
|
final String relType,
|
||||||
final String subRelType,
|
final String subRelType,
|
||||||
final String relClass,
|
final String relClass,
|
||||||
final KeyValue collectedFrom,
|
final OafEntity entity) {
|
||||||
final DataInfo info,
|
|
||||||
final long lastUpdateTimestamp) {
|
|
||||||
final Relation rel = new Relation();
|
final Relation rel = new Relation();
|
||||||
rel.setRelType(relType);
|
rel.setRelType(relType);
|
||||||
rel.setSubRelType(subRelType);
|
rel.setSubRelType(subRelType);
|
||||||
rel.setRelClass(relClass);
|
rel.setRelClass(relClass);
|
||||||
rel.setSource(source);
|
rel.setSource(source);
|
||||||
rel.setTarget(target);
|
rel.setTarget(target);
|
||||||
rel.setCollectedfrom(Arrays.asList(collectedFrom));
|
rel.setCollectedfrom(entity.getCollectedfrom());
|
||||||
rel.setDataInfo(info);
|
rel.setDataInfo(entity.getDataInfo());
|
||||||
rel.setLastupdatetimestamp(lastUpdateTimestamp);
|
rel.setLastupdatetimestamp(entity.getLastupdatetimestamp());
|
||||||
return rel;
|
return rel;
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract List<Oaf> addOtherResultRels(
|
protected abstract List<Oaf> addOtherResultRels(
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final KeyValue collectedFrom,
|
final OafEntity entity);
|
||||||
final DataInfo info,
|
|
||||||
final long lastUpdateTimestamp);
|
|
||||||
|
|
||||||
private void populateResultFields(
|
private void populateResultFields(
|
||||||
final Result r,
|
final Result r,
|
||||||
|
@ -257,7 +254,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
|
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
|
||||||
|
|
||||||
r.setOriginalId(Arrays.asList(findOriginalId(doc)));
|
r.setOriginalId(Lists.newArrayList(findOriginalId(doc)));
|
||||||
|
|
||||||
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
r.setCollectedfrom(Arrays.asList(collectedFrom));
|
||||||
r.setPid(prepareResultPids(doc, info));
|
r.setPid(prepareResultPids(doc, info));
|
||||||
|
@ -285,7 +282,7 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
|
||||||
|
|
||||||
r.setInstance(instances);
|
r.setInstance(instances);
|
||||||
r.setBestaccessright(getBestAccessRights(instances));
|
r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
|
||||||
}
|
}
|
||||||
|
|
||||||
protected abstract List<StructuredProperty> prepareResultPids(Document doc, DataInfo info);
|
protected abstract List<StructuredProperty> prepareResultPids(Document doc, DataInfo info);
|
||||||
|
@ -370,38 +367,6 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
|
||||||
|
|
||||||
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
|
|
||||||
return getBestAccessRights(instanceList);
|
|
||||||
}
|
|
||||||
|
|
||||||
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
|
|
||||||
if (instanceList != null) {
|
|
||||||
final Optional<Qualifier> min = instanceList
|
|
||||||
.stream()
|
|
||||||
.map(i -> i.getAccessright())
|
|
||||||
.min(new LicenseComparator());
|
|
||||||
|
|
||||||
final Qualifier rights = min.isPresent() ? min.get() : new Qualifier();
|
|
||||||
|
|
||||||
if (StringUtils.isBlank(rights.getClassid())) {
|
|
||||||
rights.setClassid(UNKNOWN);
|
|
||||||
}
|
|
||||||
if (StringUtils.isBlank(rights.getClassname())
|
|
||||||
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
|
|
||||||
rights.setClassname(NOT_AVAILABLE);
|
|
||||||
}
|
|
||||||
if (StringUtils.isBlank(rights.getSchemeid())) {
|
|
||||||
rights.setSchemeid(DNET_ACCESS_MODES);
|
|
||||||
}
|
|
||||||
if (StringUtils.isBlank(rights.getSchemename())) {
|
|
||||||
rights.setSchemename(DNET_ACCESS_MODES);
|
|
||||||
}
|
|
||||||
|
|
||||||
return rights;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
|
|
||||||
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
private Journal prepareJournal(final Document doc, final DataInfo info) {
|
||||||
final Node n = doc.selectSingleNode("//oaf:journal");
|
final Node n = doc.selectSingleNode("//oaf:journal");
|
||||||
if (n != null) {
|
if (n != null) {
|
||||||
|
@ -564,4 +529,5 @@ public abstract class AbstractMdRecordToOafMapper {
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -4,11 +4,9 @@ package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
import java.util.*;
|
||||||
import java.util.List;
|
|
||||||
import java.util.Objects;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.Collectors;
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -20,6 +18,7 @@ import org.apache.spark.SparkConf;
|
||||||
import org.apache.spark.api.java.JavaRDD;
|
import org.apache.spark.api.java.JavaRDD;
|
||||||
import org.apache.spark.api.java.JavaSparkContext;
|
import org.apache.spark.api.java.JavaSparkContext;
|
||||||
import org.apache.spark.sql.SparkSession;
|
import org.apache.spark.sql.SparkSession;
|
||||||
|
import org.jetbrains.annotations.NotNull;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
@ -29,16 +28,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
|
||||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
import scala.Tuple2;
|
import scala.Tuple2;
|
||||||
|
@ -124,7 +114,18 @@ public class GenerateEntitiesApplication {
|
||||||
|
|
||||||
private static Oaf merge(final Oaf o1, final Oaf o2) {
|
private static Oaf merge(final Oaf o1, final Oaf o2) {
|
||||||
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
|
if (ModelSupport.isSubClass(o1, OafEntity.class)) {
|
||||||
((OafEntity) o1).mergeFrom((OafEntity) o2);
|
if (ModelSupport.isSubClass(o1, Result.class)) {
|
||||||
|
|
||||||
|
return mergeResults((Result) o1, (Result) o2);
|
||||||
|
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
|
||||||
|
((Datasource) o1).mergeFrom((Datasource) o2);
|
||||||
|
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
|
||||||
|
((Organization) o1).mergeFrom((Organization) o2);
|
||||||
|
} else if (ModelSupport.isSubClass(o1, Project.class)) {
|
||||||
|
((Project) o1).mergeFrom((Project) o2);
|
||||||
|
} else {
|
||||||
|
throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
|
||||||
|
}
|
||||||
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
|
} else if (ModelSupport.isSubClass(o1, Relation.class)) {
|
||||||
((Relation) o1).mergeFrom((Relation) o2);
|
((Relation) o1).mergeFrom((Relation) o2);
|
||||||
} else {
|
} else {
|
||||||
|
@ -133,6 +134,19 @@ public class GenerateEntitiesApplication {
|
||||||
return o1;
|
return o1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected static Result mergeResults(Result o1, Result o2) {
|
||||||
|
Result r1 = o1;
|
||||||
|
Result r2 = o2;
|
||||||
|
|
||||||
|
if (new ResultTypeComparator().compare(r1, r2) < 0) {
|
||||||
|
r1.mergeFrom(r2);
|
||||||
|
return r1;
|
||||||
|
} else {
|
||||||
|
r2.mergeFrom(r1);
|
||||||
|
return r2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private static List<Oaf> convertToListOaf(
|
private static List<Oaf> convertToListOaf(
|
||||||
final String id,
|
final String id,
|
||||||
final String s,
|
final String s,
|
||||||
|
|
|
@ -1,16 +1,6 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.asString;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listKeyValues;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
|
||||||
|
@ -19,19 +9,25 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
@ -445,22 +441,26 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
||||||
createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
|
createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
final String targetType = rs.getString(TARGET_TYPE);
|
||||||
if (rs.getString(SOURCE_TYPE).equals("context")) {
|
if (rs.getString(SOURCE_TYPE).equals("context")) {
|
||||||
final Result r;
|
final Result r;
|
||||||
|
|
||||||
if (rs.getString(TARGET_TYPE).equals("dataset")) {
|
switch (targetType) {
|
||||||
|
case "dataset":
|
||||||
r = new Dataset();
|
r = new Dataset();
|
||||||
r.setResulttype(DATASET_DEFAULT_RESULTTYPE);
|
break;
|
||||||
} else if (rs.getString(TARGET_TYPE).equals("software")) {
|
case "software":
|
||||||
r = new Software();
|
r = new Software();
|
||||||
r.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
|
break;
|
||||||
} else if (rs.getString(TARGET_TYPE).equals("other")) {
|
case "other":
|
||||||
r = new OtherResearchProduct();
|
r = new OtherResearchProduct();
|
||||||
r.setResulttype(ORP_DEFAULT_RESULTTYPE);
|
break;
|
||||||
} else {
|
case "publication":
|
||||||
|
default:
|
||||||
r = new Publication();
|
r = new Publication();
|
||||||
r.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
r.setId(createOpenaireId(50, rs.getString("target_id"), false));
|
r.setId(createOpenaireId(50, rs.getString("target_id"), false));
|
||||||
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
r.setLastupdatetimestamp(lastUpdateTimestamp);
|
||||||
r.setContext(prepareContext(rs.getString("source_id"), info));
|
r.setContext(prepareContext(rs.getString("source_id"), info));
|
||||||
|
@ -470,7 +470,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
||||||
return Arrays.asList(r);
|
return Arrays.asList(r);
|
||||||
} else {
|
} else {
|
||||||
final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
|
final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
|
||||||
final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false);
|
final String targetId = createOpenaireId(targetType, rs.getString("target_id"), false);
|
||||||
|
|
||||||
final Relation r1 = new Relation();
|
final Relation r1 = new Relation();
|
||||||
final Relation r2 = new Relation();
|
final Relation r2 = new Relation();
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
@ -19,15 +19,8 @@ import com.google.common.collect.Lists;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
|
@ -93,7 +86,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||||
return prepareListFields(doc, "//dc:description", info);
|
return prepareListFields(doc, "//dc:description", info)
|
||||||
|
.stream()
|
||||||
|
.map(d -> {
|
||||||
|
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
|
||||||
|
return d;
|
||||||
|
})
|
||||||
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -257,11 +256,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
@Override
|
@Override
|
||||||
protected List<Oaf> addOtherResultRels(
|
protected List<Oaf> addOtherResultRels(
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final KeyValue collectedFrom,
|
final OafEntity entity) {
|
||||||
final DataInfo info,
|
|
||||||
final long lastUpdateTimestamp) {
|
|
||||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
|
||||||
|
|
||||||
|
final String docId = entity.getId();
|
||||||
final List<Oaf> res = new ArrayList<>();
|
final List<Oaf> res = new ArrayList<>();
|
||||||
|
|
||||||
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
|
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
|
||||||
|
@ -275,13 +272,11 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
getRelation(
|
getRelation(
|
||||||
docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
|
docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, entity));
|
||||||
lastUpdateTimestamp));
|
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
getRelation(
|
getRelation(
|
||||||
otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info,
|
otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, entity));
|
||||||
lastUpdateTimestamp));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return res;
|
return res;
|
||||||
|
@ -295,6 +290,10 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
@Override
|
@Override
|
||||||
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
|
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
|
||||||
return prepareListStructPropsWithValidQualifier(
|
return prepareListStructPropsWithValidQualifier(
|
||||||
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info);
|
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)
|
||||||
|
.stream()
|
||||||
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,34 +1,26 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
|
||||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
|
|
||||||
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
|
||||||
|
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
|
||||||
|
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
import org.dom4j.Document;
|
import org.dom4j.Document;
|
||||||
import org.dom4j.Node;
|
import org.dom4j.Node;
|
||||||
|
|
||||||
import com.google.common.collect.Lists;
|
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.PacePerson;
|
import eu.dnetlib.dhp.common.PacePerson;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
|
|
||||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
|
@ -198,7 +190,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
|
||||||
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info);
|
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info)
|
||||||
|
.stream()
|
||||||
|
.map(d -> {
|
||||||
|
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
|
||||||
|
return d;
|
||||||
|
})
|
||||||
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
|
@ -314,11 +312,9 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
@Override
|
@Override
|
||||||
protected List<Oaf> addOtherResultRels(
|
protected List<Oaf> addOtherResultRels(
|
||||||
final Document doc,
|
final Document doc,
|
||||||
final KeyValue collectedFrom,
|
final OafEntity entity) {
|
||||||
final DataInfo info,
|
|
||||||
final long lastUpdateTimestamp) {
|
|
||||||
|
|
||||||
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
|
final String docId = entity.getId();
|
||||||
|
|
||||||
final List<Oaf> res = new ArrayList<>();
|
final List<Oaf> res = new ArrayList<>();
|
||||||
|
|
||||||
|
@ -330,30 +326,26 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
final String otherId = createOpenaireId(50, originalId, false);
|
final String otherId = createOpenaireId(50, originalId, false);
|
||||||
final String type = ((Node) o).valueOf("@relationType");
|
final String type = ((Node) o).valueOf("@relationType");
|
||||||
|
|
||||||
if (type.equalsIgnoreCase("IsSupplementTo")) {
|
if (type.equalsIgnoreCase(IS_SUPPLEMENT_TO)) {
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
getRelation(
|
getRelation(
|
||||||
docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info,
|
docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity));
|
||||||
lastUpdateTimestamp));
|
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
getRelation(
|
getRelation(
|
||||||
otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info,
|
otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity));
|
||||||
lastUpdateTimestamp));
|
} else if (type.equalsIgnoreCase(IS_PART_OF)) {
|
||||||
} else if (type.equals("IsPartOf")) {
|
|
||||||
|
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
getRelation(
|
getRelation(
|
||||||
docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info,
|
docId, otherId, RESULT_RESULT, PART, IS_PART_OF, entity));
|
||||||
lastUpdateTimestamp));
|
|
||||||
res
|
res
|
||||||
.add(
|
.add(
|
||||||
getRelation(
|
getRelation(
|
||||||
otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info,
|
otherId, docId, RESULT_RESULT, PART, HAS_PARTS, entity));
|
||||||
lastUpdateTimestamp));
|
|
||||||
} else {
|
} else {
|
||||||
|
// TODO catch more semantics
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -384,7 +376,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||||
doc,
|
doc,
|
||||||
"//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
|
"//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
|
||||||
"@alternateIdentifierType", DNET_PID_TYPES, info));
|
"@alternateIdentifierType", DNET_PID_TYPES, info));
|
||||||
return Lists.newArrayList(res);
|
|
||||||
|
return res
|
||||||
|
.stream()
|
||||||
|
.map(CleaningFunctions::normalizePidValue)
|
||||||
|
.collect(Collectors.toList());
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
import com.google.common.collect.Maps;
|
import com.google.common.collect.Maps;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
|
|
||||||
public class Vocabulary implements Serializable {
|
public class Vocabulary implements Serializable {
|
||||||
|
|
|
@ -7,6 +7,7 @@ import java.util.stream.Collectors;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
|
@ -4,7 +4,6 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
|
||||||
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation}
|
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation}
|
||||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
|
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
|
||||||
import eu.dnetlib.dhp.sx.ebi.EBIAggregator
|
import eu.dnetlib.dhp.sx.ebi.EBIAggregator
|
||||||
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal}
|
|
||||||
import org.apache.commons.io.IOUtils
|
import org.apache.commons.io.IOUtils
|
||||||
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
|
||||||
import org.slf4j.LoggerFactory
|
import org.slf4j.LoggerFactory
|
||||||
|
@ -18,38 +17,38 @@ object SparkSplitOafTODLIEntities {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
def main(args: Array[String]): Unit = {
|
|
||||||
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")))
|
|
||||||
val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass)
|
|
||||||
parser.parseArgument(args)
|
|
||||||
|
|
||||||
val workingPath: String = parser.get("workingPath")
|
def extract_dataset(spark:SparkSession, workingPath:String) :Unit = {
|
||||||
logger.info(s"Working dir path = $workingPath")
|
|
||||||
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
|
||||||
|
|
||||||
|
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf].repartition(4000)
|
||||||
|
|
||||||
|
val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset].repartition(1000)
|
||||||
|
|
||||||
|
|
||||||
|
OAFDataset
|
||||||
|
.filter(s => s != null && s.isInstanceOf[DLIDataset])
|
||||||
|
.map(s =>s.asInstanceOf[DLIDataset])
|
||||||
|
.union(ebi_dataset)
|
||||||
|
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder))
|
||||||
|
.groupByKey(_._1)(Encoders.STRING)
|
||||||
|
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
|
||||||
|
.map(p => p._2)
|
||||||
|
.repartition(2000)
|
||||||
|
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset")
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
def extract_publication(spark:SparkSession, workingPath:String) :Unit = {
|
||||||
|
|
||||||
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
|
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
|
||||||
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
|
|
||||||
implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]
|
|
||||||
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val spark:SparkSession = SparkSession
|
|
||||||
.builder()
|
|
||||||
.appName(SparkSplitOafTODLIEntities.getClass.getSimpleName)
|
|
||||||
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
|
||||||
.master(parser.get("master"))
|
|
||||||
.getOrCreate()
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
|
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
|
||||||
|
|
||||||
val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset]
|
val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication].repartition(1000)
|
||||||
val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication]
|
|
||||||
val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation]
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
OAFDataset
|
OAFDataset
|
||||||
|
@ -60,20 +59,17 @@ object SparkSplitOafTODLIEntities {
|
||||||
.groupByKey(_._1)(Encoders.STRING)
|
.groupByKey(_._1)(Encoders.STRING)
|
||||||
.agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
|
.agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
|
||||||
.map(p => p._2)
|
.map(p => p._2)
|
||||||
.repartition(1000)
|
.repartition(2000)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/publication")
|
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/publication")
|
||||||
|
|
||||||
OAFDataset
|
}
|
||||||
.filter(s => s != null && s.isInstanceOf[DLIDataset])
|
|
||||||
.map(s =>s.asInstanceOf[DLIDataset])
|
|
||||||
.union(ebi_dataset)
|
|
||||||
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder))
|
|
||||||
.groupByKey(_._1)(Encoders.STRING)
|
|
||||||
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
|
|
||||||
.map(p => p._2)
|
|
||||||
.repartition(1000)
|
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset")
|
|
||||||
|
|
||||||
|
def extract_unknown(spark:SparkSession, workingPath:String) :Unit = {
|
||||||
|
|
||||||
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]
|
||||||
|
|
||||||
|
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
|
||||||
|
|
||||||
OAFDataset
|
OAFDataset
|
||||||
.filter(s => s != null && s.isInstanceOf[DLIUnknown])
|
.filter(s => s != null && s.isInstanceOf[DLIUnknown])
|
||||||
|
@ -82,9 +78,18 @@ object SparkSplitOafTODLIEntities {
|
||||||
.groupByKey(_._1)(Encoders.STRING)
|
.groupByKey(_._1)(Encoders.STRING)
|
||||||
.agg(EBIAggregator.getDLIUnknownAggregator().toColumn)
|
.agg(EBIAggregator.getDLIUnknownAggregator().toColumn)
|
||||||
.map(p => p._2)
|
.map(p => p._2)
|
||||||
.repartition(1000)
|
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/unknown")
|
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/unknown")
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_relations(spark:SparkSession, workingPath:String) :Unit = {
|
||||||
|
|
||||||
|
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
|
||||||
|
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
|
||||||
|
|
||||||
|
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
|
||||||
|
val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation].repartition(2000)
|
||||||
|
|
||||||
OAFDataset
|
OAFDataset
|
||||||
.filter(s => s != null && s.isInstanceOf[Relation])
|
.filter(s => s != null && s.isInstanceOf[Relation])
|
||||||
|
@ -94,9 +99,35 @@ object SparkSplitOafTODLIEntities {
|
||||||
.groupByKey(_._1)(Encoders.STRING)
|
.groupByKey(_._1)(Encoders.STRING)
|
||||||
.agg(EBIAggregator.getRelationAggregator().toColumn)
|
.agg(EBIAggregator.getRelationAggregator().toColumn)
|
||||||
.map(p => p._2)
|
.map(p => p._2)
|
||||||
.repartition(1000)
|
.repartition(4000)
|
||||||
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
|
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def main(args: Array[String]): Unit = {
|
||||||
|
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")))
|
||||||
|
val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass)
|
||||||
|
parser.parseArgument(args)
|
||||||
|
|
||||||
|
val workingPath: String = parser.get("workingPath")
|
||||||
|
val entity:String = parser.get("entity")
|
||||||
|
logger.info(s"Working dir path = $workingPath")
|
||||||
|
|
||||||
|
val spark:SparkSession = SparkSession
|
||||||
|
.builder()
|
||||||
|
.appName(SparkSplitOafTODLIEntities.getClass.getSimpleName)
|
||||||
|
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
|
||||||
|
.master(parser.get("master"))
|
||||||
|
.getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
|
entity match {
|
||||||
|
case "publication" => extract_publication(spark, workingPath)
|
||||||
|
case "dataset" => extract_dataset(spark,workingPath)
|
||||||
|
case "relation" => extract_relations(spark, workingPath)
|
||||||
|
case "unknown" => extract_unknown(spark, workingPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
[
|
[
|
||||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true}
|
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true},
|
||||||
|
{"paramName":"e", "paramLongName":"entity", "paramDescription": "the work dir path", "paramRequired": true}
|
||||||
]
|
]
|
|
@ -14,30 +14,103 @@
|
||||||
</property>
|
</property>
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ExtractDLIEntities"/>
|
<start to="ExtractDLIPublication"/>
|
||||||
|
|
||||||
<kill name="Kill">
|
<kill name="Kill">
|
||||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
</kill>
|
</kill>
|
||||||
|
|
||||||
<action name="ExtractDLIEntities">
|
<action name="ExtractDLIPublication">
|
||||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
<name-node>${nameNode}</name-node>
|
<name-node>${nameNode}</name-node>
|
||||||
<master>yarn-cluster</master>
|
<master>yarn-cluster</master>
|
||||||
<mode>cluster</mode>
|
<mode>cluster</mode>
|
||||||
<name>Extract DLI Entities</name>
|
<name>Extract DLI Entities (Publication)</name>
|
||||||
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
||||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
<spark-opts>
|
<spark-opts>
|
||||||
--executor-memory ${sparkExecutorMemory}
|
--executor-memory ${sparkExecutorMemory}
|
||||||
--executor-cores=${sparkExecutorCores}
|
--executor-cores=${sparkExecutorCores}
|
||||||
--driver-memory=${sparkDriverMemory}
|
--driver-memory=${sparkDriverMemory}
|
||||||
--conf spark.sql.shuffle.partitions=3840
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
${sparkExtraOPT}
|
${sparkExtraOPT}
|
||||||
</spark-opts>
|
</spark-opts>
|
||||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||||
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>-e</arg><arg>publication</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="ExtractDLIDataset"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractDLIDataset">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Extract DLI Entities (Dataset)</name>
|
||||||
|
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory ${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||||
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>-e</arg><arg>dataset</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="ExtractDLIUnknown"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractDLIUnknown">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Extract DLI Entities (Unknown)</name>
|
||||||
|
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory ${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||||
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>-e</arg><arg>unknown</arg>
|
||||||
|
</spark>
|
||||||
|
<ok to="ExtractDLIRelation"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<action name="ExtractDLIRelation">
|
||||||
|
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<master>yarn-cluster</master>
|
||||||
|
<mode>cluster</mode>
|
||||||
|
<name>Extract DLI Entities (Relation)</name>
|
||||||
|
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
|
||||||
|
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||||
|
<spark-opts>
|
||||||
|
--executor-memory ${sparkExecutorMemory}
|
||||||
|
--executor-cores=${sparkExecutorCores}
|
||||||
|
--driver-memory=${sparkDriverMemory}
|
||||||
|
--conf spark.sql.shuffle.partitions=5000
|
||||||
|
${sparkExtraOPT}
|
||||||
|
</spark-opts>
|
||||||
|
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||||
|
<arg>--workingPath</arg><arg>${workingPath}</arg>
|
||||||
|
<arg>-e</arg><arg>relation</arg>
|
||||||
</spark>
|
</spark>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -7,8 +7,6 @@ import static org.mockito.Mockito.lenient;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.function.Predicate;
|
|
||||||
import java.util.stream.Collectors;
|
|
||||||
import java.util.stream.Stream;
|
import java.util.stream.Stream;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
|
|
@ -0,0 +1,99 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||||
|
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||||
|
import static org.mockito.Mockito.lenient;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.junit.jupiter.api.BeforeEach;
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
import org.junit.jupiter.api.extension.ExtendWith;
|
||||||
|
import org.mockito.Mock;
|
||||||
|
import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
|
||||||
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||||
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
public class GenerateEntitiesApplicationTest {
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private ISLookUpService isLookUpService;
|
||||||
|
|
||||||
|
@Mock
|
||||||
|
private VocabularyGroup vocs;
|
||||||
|
|
||||||
|
@BeforeEach
|
||||||
|
public void setUp() throws IOException, ISLookUpException {
|
||||||
|
|
||||||
|
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
||||||
|
lenient()
|
||||||
|
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
||||||
|
.thenReturn(synonyms());
|
||||||
|
|
||||||
|
vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMergeResult() throws IOException {
|
||||||
|
Result publication = getResult("oaf_record.xml", Publication.class);
|
||||||
|
Result dataset = getResult("odf_dataset.xml", Dataset.class);
|
||||||
|
Result software = getResult("odf_software.xml", Software.class);
|
||||||
|
Result orp = getResult("oaf_orp.xml", OtherResearchProduct.class);
|
||||||
|
|
||||||
|
verifyMerge(publication, dataset, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||||
|
verifyMerge(dataset, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||||
|
|
||||||
|
verifyMerge(publication, software, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||||
|
verifyMerge(software, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||||
|
|
||||||
|
verifyMerge(publication, orp, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||||
|
verifyMerge(orp, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||||
|
|
||||||
|
verifyMerge(dataset, software, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
|
||||||
|
verifyMerge(software, dataset, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
|
||||||
|
|
||||||
|
verifyMerge(dataset, orp, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
|
||||||
|
verifyMerge(orp, dataset, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
|
||||||
|
|
||||||
|
verifyMerge(software, orp, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
|
||||||
|
verifyMerge(orp, software, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
|
||||||
|
}
|
||||||
|
|
||||||
|
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
|
||||||
|
String resultType) {
|
||||||
|
final Result merge = GenerateEntitiesApplication.mergeResults(publication, dataset);
|
||||||
|
assertTrue(clazz.isAssignableFrom(merge.getClass()));
|
||||||
|
assertEquals(resultType, merge.getResulttype().getClassid());
|
||||||
|
}
|
||||||
|
|
||||||
|
protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz) throws IOException {
|
||||||
|
final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
|
||||||
|
return new OdfToOafMapper(vocs, false)
|
||||||
|
.processMdRecord(xml)
|
||||||
|
.stream()
|
||||||
|
.filter(s -> clazz.isAssignableFrom(s.getClass()))
|
||||||
|
.map(s -> (Result) s)
|
||||||
|
.findFirst()
|
||||||
|
.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> vocs() throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
|
||||||
|
}
|
||||||
|
|
||||||
|
private List<String> synonyms() throws IOException {
|
||||||
|
return IOUtils
|
||||||
|
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -24,14 +24,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
|
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
import eu.dnetlib.dhp.schema.oaf.*;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
|
||||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||||
|
|
||||||
@ExtendWith(MockitoExtension.class)
|
@ExtendWith(MockitoExtension.class)
|
||||||
|
@ -71,7 +65,7 @@ public class MappersTest {
|
||||||
|
|
||||||
assertValidId(p.getId());
|
assertValidId(p.getId());
|
||||||
|
|
||||||
assertTrue(p.getOriginalId().size() == 1);
|
assertTrue(p.getOriginalId().size() == 2);
|
||||||
assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0));
|
assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0));
|
||||||
|
|
||||||
assertValidId(p.getCollectedfrom().get(0).getKey());
|
assertValidId(p.getCollectedfrom().get(0).getKey());
|
||||||
|
@ -123,22 +117,7 @@ public class MappersTest {
|
||||||
|
|
||||||
assertNotNull(p.getBestaccessright());
|
assertNotNull(p.getBestaccessright());
|
||||||
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
assertEquals("OPEN", p.getBestaccessright().getClassid());
|
||||||
assertValidId(r1.getSource());
|
verifyRelations(p, r1, r2);
|
||||||
assertValidId(r1.getTarget());
|
|
||||||
assertValidId(r2.getSource());
|
|
||||||
assertValidId(r2.getTarget());
|
|
||||||
assertValidId(r1.getCollectedfrom().get(0).getKey());
|
|
||||||
assertValidId(r2.getCollectedfrom().get(0).getKey());
|
|
||||||
assertNotNull(r1.getDataInfo());
|
|
||||||
assertNotNull(r2.getDataInfo());
|
|
||||||
assertNotNull(r1.getDataInfo().getTrust());
|
|
||||||
assertNotNull(r2.getDataInfo().getTrust());
|
|
||||||
assertEquals(r1.getSource(), r2.getTarget());
|
|
||||||
assertEquals(r2.getSource(), r1.getTarget());
|
|
||||||
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
|
|
||||||
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
|
|
||||||
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
|
|
||||||
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
|
|
||||||
|
|
||||||
// System.out.println(new ObjectMapper().writeValueAsString(p));
|
// System.out.println(new ObjectMapper().writeValueAsString(p));
|
||||||
// System.out.println(new ObjectMapper().writeValueAsString(r1));
|
// System.out.println(new ObjectMapper().writeValueAsString(r1));
|
||||||
|
@ -177,7 +156,7 @@ public class MappersTest {
|
||||||
final Relation r2 = (Relation) list.get(2);
|
final Relation r2 = (Relation) list.get(2);
|
||||||
|
|
||||||
assertValidId(d.getId());
|
assertValidId(d.getId());
|
||||||
assertTrue(d.getOriginalId().size() == 1);
|
assertTrue(d.getOriginalId().size() == 2);
|
||||||
assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0));
|
assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0));
|
||||||
assertValidId(d.getCollectedfrom().get(0).getKey());
|
assertValidId(d.getCollectedfrom().get(0).getKey());
|
||||||
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
|
||||||
|
@ -230,10 +209,19 @@ public class MappersTest {
|
||||||
});
|
});
|
||||||
assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid());
|
assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid());
|
||||||
|
|
||||||
|
verifyRelations(d, r1, r2);
|
||||||
|
}
|
||||||
|
|
||||||
|
private void verifyRelations(OafEntity e, Relation r1, Relation r2) {
|
||||||
|
assertEquals(e.getId(), r1.getSource());
|
||||||
|
assertEquals(e.getId(), r2.getTarget());
|
||||||
|
|
||||||
assertValidId(r1.getSource());
|
assertValidId(r1.getSource());
|
||||||
assertValidId(r1.getTarget());
|
assertValidId(r1.getTarget());
|
||||||
assertValidId(r2.getSource());
|
assertValidId(r2.getSource());
|
||||||
assertValidId(r2.getTarget());
|
assertValidId(r2.getTarget());
|
||||||
|
assertValidId(r1.getCollectedfrom().get(0).getKey());
|
||||||
|
assertValidId(r2.getCollectedfrom().get(0).getKey());
|
||||||
assertNotNull(r1.getDataInfo());
|
assertNotNull(r1.getDataInfo());
|
||||||
assertNotNull(r2.getDataInfo());
|
assertNotNull(r2.getDataInfo());
|
||||||
assertNotNull(r1.getDataInfo().getTrust());
|
assertNotNull(r1.getDataInfo().getTrust());
|
||||||
|
|
|
@ -27,10 +27,10 @@ import org.mockito.junit.jupiter.MockitoExtension;
|
||||||
import com.fasterxml.jackson.core.type.TypeReference;
|
import com.fasterxml.jackson.core.type.TypeReference;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
|
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||||
|
|
|
@ -30,7 +30,7 @@ class SparkScholexplorerAggregationTest {
|
||||||
|
|
||||||
|
|
||||||
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
|
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
|
||||||
val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").getOrCreate()
|
val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
|
||||||
|
|
||||||
|
|
||||||
val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication]
|
val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication]
|
||||||
|
|
|
@ -0,0 +1,83 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||||
|
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||||
|
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||||
|
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||||
|
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||||
|
<header xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dri:objIdentifier>pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2</dri:objIdentifier>
|
||||||
|
<dri:recordIdentifier>10.3897/oneeco.2.e13718</dri:recordIdentifier>
|
||||||
|
<dri:dateOfCollection/>
|
||||||
|
<dri:mdFormat/>
|
||||||
|
<dri:mdFormatInterpretation/>
|
||||||
|
<dri:repositoryId/>
|
||||||
|
<dr:objectIdentifier/>
|
||||||
|
<dr:dateOfCollection>2020-03-23T00:20:51.392Z</dr:dateOfCollection>
|
||||||
|
<dr:dateOfTransformation>2020-03-23T00:26:59.078Z</dr:dateOfTransformation>
|
||||||
|
<oaf:datasourceprefix>pensoft_____</oaf:datasourceprefix>
|
||||||
|
</header>
|
||||||
|
<metadata xmlns="http://namespace.openaire.eu/">
|
||||||
|
<dc:title>Ecosystem Service capacity is higher in areas of multiple designation types</dc:title>
|
||||||
|
<dc:creator>Nikolaidou,Charitini</dc:creator>
|
||||||
|
<dc:creator nameIdentifier="0000-0001-6651-1178" nameIdentifierScheme="ORCID">Votsi,Nefta</dc:creator>
|
||||||
|
<dc:creator>Sgardelis,Steanos</dc:creator>
|
||||||
|
<dc:creator>Halley,John</dc:creator>
|
||||||
|
<dc:creator>Pantis,John</dc:creator>
|
||||||
|
<dc:creator>Tsiafouli,Maria</dc:creator>
|
||||||
|
<dc:date>2017</dc:date>
|
||||||
|
<dc:description>The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management.</dc:description>
|
||||||
|
<dc:format>text/html</dc:format>
|
||||||
|
<dc:identifier>https://doi.org/10.3897/oneeco.2.e13718</dc:identifier>
|
||||||
|
<dc:identifier>https://oneecosystem.pensoft.net/article/13718/</dc:identifier>
|
||||||
|
<dc:language>eng</dc:language>
|
||||||
|
<dc:publisher>Pensoft Publishers</dc:publisher>
|
||||||
|
<dc:relation>info:eu-repo/semantics/altIdentifier/eissn/2367-8194</dc:relation>
|
||||||
|
<dc:relation>info:eu-repo/grantAgreement/EC/FP7/226852</dc:relation>
|
||||||
|
<dc:source>One Ecosystem 2: e13718</dc:source>
|
||||||
|
<dc:source>One Ecosystem 2: e13718</dc:source>
|
||||||
|
<dc:source>One Ecosystem 2: e13718</dc:source>
|
||||||
|
<dc:subject>Ecosystem Services hotspots</dc:subject>
|
||||||
|
<dc:subject>Natura 2000</dc:subject>
|
||||||
|
<dc:subject>Quiet Protected Areas</dc:subject>
|
||||||
|
<dc:subject>Biodiversity</dc:subject>
|
||||||
|
<dc:subject>Agriculture</dc:subject>
|
||||||
|
<dc:subject>Elevation</dc:subject>
|
||||||
|
<dc:subject>Slope</dc:subject>
|
||||||
|
<dc:subject>Ecosystem Service trade-offs and synergies</dc:subject>
|
||||||
|
<dc:subject> cultural services</dc:subject>
|
||||||
|
<dc:subject>provisioning services</dc:subject>
|
||||||
|
<dc:subject>regulating services</dc:subject>
|
||||||
|
<dc:subject>supporting services</dc:subject>
|
||||||
|
<dc:type>Research Artefact</dc:type>
|
||||||
|
<dr:CobjCategory type="other">0020</dr:CobjCategory>
|
||||||
|
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
|
||||||
|
<oaf:projectid>corda_______::226852</oaf:projectid>
|
||||||
|
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||||
|
<oaf:hostedBy id="openaire____::issn226852" name="One Ecosystem"/>
|
||||||
|
<oaf:collectedFrom
|
||||||
|
id="openaire____::45e3c7b69bcee6cc5fa945c9e183deb9" name="Pensoft"/>
|
||||||
|
<oaf:identifier identifierType="doi">10.3897/oneeco.2.e13718</oaf:identifier>
|
||||||
|
<oaf:fulltext>https://oneecosystem.pensoft.net/article/13718/</oaf:fulltext>
|
||||||
|
<oaf:journal eissn="2367-8194" issn="">One Ecosystem</oaf:journal>
|
||||||
|
<oaf:refereed>0001</oaf:refereed>
|
||||||
|
</metadata>
|
||||||
|
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||||
|
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||||
|
<originDescription altered="true" harvestDate="2020-03-23T00:20:51.392Z">
|
||||||
|
<baseURL>http%3A%2F%2Fzookeys.pensoft.net%2Foai.php</baseURL>
|
||||||
|
<identifier>10.3897/oneeco.2.e13718</identifier>
|
||||||
|
<datestamp>2017-09-08</datestamp>
|
||||||
|
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
|
||||||
|
</originDescription>
|
||||||
|
</provenance>
|
||||||
|
<oaf:datainfo>
|
||||||
|
<oaf:inferred>false</oaf:inferred>
|
||||||
|
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||||
|
<oaf:trust>0.9</oaf:trust>
|
||||||
|
<oaf:inferenceprovenance/>
|
||||||
|
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||||
|
classname="sysimport:crosswalk:repository"
|
||||||
|
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||||
|
</oaf:datainfo>
|
||||||
|
</about>
|
||||||
|
</record>
|
|
@ -62,6 +62,10 @@
|
||||||
<artifactId>dhp-schemas</artifactId>
|
<artifactId>dhp-schemas</artifactId>
|
||||||
<version>${project.version}</version>
|
<version>${project.version}</version>
|
||||||
</dependency>
|
</dependency>
|
||||||
|
<dependency>
|
||||||
|
<groupId>org.apache.httpcomponents</groupId>
|
||||||
|
<artifactId>httpmime</artifactId>
|
||||||
|
</dependency>
|
||||||
|
|
||||||
<dependency>
|
<dependency>
|
||||||
<groupId>org.elasticsearch</groupId>
|
<groupId>org.elasticsearch</groupId>
|
||||||
|
|
|
@ -0,0 +1,111 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.export.zenodo;
|
||||||
|
|
||||||
|
import java.io.*;
|
||||||
|
|
||||||
|
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||||
|
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.*;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
|
||||||
|
public class MakeTar implements Serializable {
|
||||||
|
|
||||||
|
private static final Logger log = LoggerFactory.getLogger(MakeTar.class);
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
String jsonConfiguration = IOUtils
|
||||||
|
.toString(
|
||||||
|
MakeTar.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/export/input_maketar_parameters.json"));
|
||||||
|
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
final String outputPath = parser.get("targetPath");
|
||||||
|
log.info("hdfsPath: {}", outputPath);
|
||||||
|
|
||||||
|
final String hdfsNameNode = parser.get("nameNode");
|
||||||
|
log.info("nameNode: {}", hdfsNameNode);
|
||||||
|
|
||||||
|
final String inputPath = parser.get("sourcePath");
|
||||||
|
log.info("input path : {}", inputPath);
|
||||||
|
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
conf.set("fs.defaultFS", hdfsNameNode);
|
||||||
|
|
||||||
|
FileSystem fileSystem = FileSystem.get(conf);
|
||||||
|
|
||||||
|
makeTArArchive(fileSystem, inputPath, outputPath);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException {
|
||||||
|
|
||||||
|
RemoteIterator<LocatedFileStatus> dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath));
|
||||||
|
|
||||||
|
while (dir_iterator.hasNext()) {
|
||||||
|
LocatedFileStatus fileStatus = dir_iterator.next();
|
||||||
|
|
||||||
|
Path p = fileStatus.getPath();
|
||||||
|
String p_string = p.toString();
|
||||||
|
String entity = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||||
|
|
||||||
|
write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
|
||||||
|
throws IOException {
|
||||||
|
|
||||||
|
Path hdfsWritePath = new Path(outputPath);
|
||||||
|
FSDataOutputStream fsDataOutputStream = null;
|
||||||
|
if (fileSystem.exists(hdfsWritePath)) {
|
||||||
|
fileSystem.delete(hdfsWritePath, true);
|
||||||
|
|
||||||
|
}
|
||||||
|
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||||
|
|
||||||
|
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
|
||||||
|
|
||||||
|
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||||
|
.listFiles(
|
||||||
|
new Path(inputPath), true);
|
||||||
|
|
||||||
|
while (fileStatusListIterator.hasNext()) {
|
||||||
|
LocatedFileStatus fileStatus = fileStatusListIterator.next();
|
||||||
|
|
||||||
|
Path p = fileStatus.getPath();
|
||||||
|
String p_string = p.toString();
|
||||||
|
if (!p_string.endsWith("_SUCCESS")) {
|
||||||
|
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||||
|
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz");
|
||||||
|
entry.setSize(fileStatus.getLen());
|
||||||
|
ar.putArchiveEntry(entry);
|
||||||
|
|
||||||
|
InputStream is = fileSystem.open(fileStatus.getPath());
|
||||||
|
|
||||||
|
BufferedInputStream bis = new BufferedInputStream(is);
|
||||||
|
|
||||||
|
int count;
|
||||||
|
byte data[] = new byte[1024];
|
||||||
|
while ((count = bis.read(data, 0, data.length)) != -1) {
|
||||||
|
ar.write(data, 0, count);
|
||||||
|
}
|
||||||
|
bis.close();
|
||||||
|
ar.closeArchiveEntry();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
ar.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,80 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.export.zenodo;
|
||||||
|
|
||||||
|
import java.io.Serializable;
|
||||||
|
import java.util.Optional;
|
||||||
|
|
||||||
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.commons.logging.Log;
|
||||||
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.fs.*;
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
|
||||||
|
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
|
||||||
|
|
||||||
|
public class SendToZenodoHDFS implements Serializable {
|
||||||
|
|
||||||
|
private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class);
|
||||||
|
|
||||||
|
public static void main(final String[] args) throws Exception, MissingConceptDoiException {
|
||||||
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
|
IOUtils
|
||||||
|
.toString(
|
||||||
|
SendToZenodoHDFS.class
|
||||||
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/export/upload_zenodo.json")));
|
||||||
|
|
||||||
|
parser.parseArgument(args);
|
||||||
|
|
||||||
|
final String hdfsPath = parser.get("hdfsPath");
|
||||||
|
final String hdfsNameNode = parser.get("nameNode");
|
||||||
|
final String access_token = parser.get("accessToken");
|
||||||
|
final String connection_url = parser.get("connectionUrl");
|
||||||
|
final String metadata = parser.get("metadata");
|
||||||
|
final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition"));
|
||||||
|
final String concept_rec_id = Optional
|
||||||
|
.ofNullable(parser.get("conceptRecordId"))
|
||||||
|
.orElse(null);
|
||||||
|
|
||||||
|
Configuration conf = new Configuration();
|
||||||
|
conf.set("fs.defaultFS", hdfsNameNode);
|
||||||
|
|
||||||
|
FileSystem fileSystem = FileSystem.get(conf);
|
||||||
|
|
||||||
|
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||||
|
.listFiles(
|
||||||
|
new Path(hdfsPath), true);
|
||||||
|
ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
|
||||||
|
if (newDeposition) {
|
||||||
|
zenodoApiClient.newDeposition();
|
||||||
|
} else {
|
||||||
|
if (concept_rec_id == null) {
|
||||||
|
throw new MissingConceptDoiException("No concept record id has been provided");
|
||||||
|
}
|
||||||
|
zenodoApiClient.newVersion(concept_rec_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
while (fileStatusListIterator.hasNext()) {
|
||||||
|
LocatedFileStatus fileStatus = fileStatusListIterator.next();
|
||||||
|
|
||||||
|
Path p = fileStatus.getPath();
|
||||||
|
String p_string = p.toString();
|
||||||
|
if (!p_string.endsWith("_SUCCESS")) {
|
||||||
|
// String tmp = p_string.substring(0, p_string.lastIndexOf("/"));
|
||||||
|
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||||
|
log.info("Sending information for community: " + name);
|
||||||
|
FSDataInputStream inputStream = fileSystem.open(p);
|
||||||
|
zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen());
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
zenodoApiClient.sendMretadata(metadata);
|
||||||
|
zenodoApiClient.publish();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,20 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName": "n",
|
||||||
|
"paramLongName": "nameNode",
|
||||||
|
"paramDescription": "the Name Node",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "s",
|
||||||
|
"paramLongName": "sourcePath",
|
||||||
|
"paramDescription": "the source path",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "t",
|
||||||
|
"paramLongName": "targetPath",
|
||||||
|
"paramDescription": "the target path",
|
||||||
|
"paramRequired": true
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,45 @@
|
||||||
|
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"paramName":"nd",
|
||||||
|
"paramLongName":"newDeposition",
|
||||||
|
"paramDescription": "if it is a new deposition (true) or a new version (false)",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName":"cri",
|
||||||
|
"paramLongName":"conceptRecordId",
|
||||||
|
"paramDescription": "The id of the concept record for a new version",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName":"hdfsp",
|
||||||
|
"paramLongName":"hdfsPath",
|
||||||
|
"paramDescription": "the path of the folder tofind files to send to Zenodo",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "nn",
|
||||||
|
"paramLongName": "nameNode",
|
||||||
|
"paramDescription": "the name node",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName": "at",
|
||||||
|
"paramLongName": "accessToken",
|
||||||
|
"paramDescription": "the access token for the deposition",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName":"cu",
|
||||||
|
"paramLongName":"connectionUrl",
|
||||||
|
"paramDescription": "the url to connect to deposit",
|
||||||
|
"paramRequired": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"paramName":"m",
|
||||||
|
"paramLongName":"metadata",
|
||||||
|
"paramDescription": "metadata associated to the deposition",
|
||||||
|
"paramRequired": false
|
||||||
|
}
|
||||||
|
]
|
|
@ -0,0 +1,48 @@
|
||||||
|
<configuration>
|
||||||
|
<property>
|
||||||
|
<name>jobTracker</name>
|
||||||
|
<value>yarnRM</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>nameNode</name>
|
||||||
|
<value>hdfs://nameservice1</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.use.system.libpath</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.action.sharelib.for.spark</name>
|
||||||
|
<value>spark2</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>oozie.wf.rerun.failnodes</name>
|
||||||
|
<value>false</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>hive_metastore_uris</name>
|
||||||
|
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2YarnHistoryServerAddress</name>
|
||||||
|
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2EventLogDir</name>
|
||||||
|
<value>/user/spark/spark2ApplicationHistory</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2ExtraListeners</name>
|
||||||
|
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>spark2SqlQueryExecutionListeners</name>
|
||||||
|
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
<property>
|
||||||
|
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||||
|
<value>true</value>
|
||||||
|
</property>
|
||||||
|
|
||||||
|
</configuration>
|
|
@ -0,0 +1,53 @@
|
||||||
|
<workflow-app name="Send Dump to Zenodo" xmlns="uri:oozie:workflow:0.5">
|
||||||
|
<parameters>
|
||||||
|
<property>
|
||||||
|
<name>sourcePath</name>
|
||||||
|
<description>the source path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>targetPath</name>
|
||||||
|
<description>the target path</description>
|
||||||
|
</property>
|
||||||
|
<property>
|
||||||
|
<name>metadata</name>
|
||||||
|
<description>the metadata</description>
|
||||||
|
</property>
|
||||||
|
</parameters>
|
||||||
|
|
||||||
|
<start to="send_zenodo"/>
|
||||||
|
|
||||||
|
<kill name="Kill">
|
||||||
|
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||||
|
</kill>
|
||||||
|
|
||||||
|
<action name="MakeTar">
|
||||||
|
<java>
|
||||||
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
|
<name-node>${nameNode}</name-node>
|
||||||
|
<main-class>eu.dnetlib.dhp.export.zenodo.MakeTar</main-class>
|
||||||
|
<arg>-t</arg><arg>${targetPath}</arg>
|
||||||
|
<arg>-n</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>-s</arg><arg>${sourcePath}</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
|
||||||
|
<action name="send_zenodo">
|
||||||
|
<java>
|
||||||
|
<main-class>eu.dnetlib.dhp.export.zenodo.SendToZenodoHDFS</main-class>
|
||||||
|
<arg>--hdfsPath</arg><arg>/user/dnet.scholexplorer/scholix/provision/scholix.tar/scholix-2020-10-16.tar</arg>
|
||||||
|
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||||
|
<arg>--accessToken</arg><arg>b6ddrY6b77WxcDEevn9gqVE5sL5sDNjdUijt75W3o7cQo5vpFFI48dMiu8Gv</arg>
|
||||||
|
<arg>--connectionUrl</arg><arg>https://zenodo.org/api/deposit/depositions</arg>
|
||||||
|
<arg>--metadata</arg><arg>${metadata}</arg>
|
||||||
|
<arg>--conceptRecordId</arg><arg>1200252</arg>
|
||||||
|
<arg>--newDeposition</arg><arg>false</arg>
|
||||||
|
</java>
|
||||||
|
<ok to="End"/>
|
||||||
|
<error to="Kill"/>
|
||||||
|
</action>
|
||||||
|
|
||||||
|
<end name="End"/>
|
||||||
|
</workflow-app>
|
|
@ -3,14 +3,15 @@ package eu.dnetlib.dhp.export
|
||||||
import java.time.LocalDateTime
|
import java.time.LocalDateTime
|
||||||
import java.time.format.DateTimeFormatter
|
import java.time.format.DateTimeFormatter
|
||||||
|
|
||||||
|
import eu.dnetlib.dhp.provision.scholix.Scholix
|
||||||
|
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
|
||||||
import eu.dnetlib.dhp.schema.oaf.Relation
|
import eu.dnetlib.dhp.schema.oaf.Relation
|
||||||
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
|
||||||
|
|
||||||
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
|
||||||
import org.junit.jupiter.api.Test
|
import org.junit.jupiter.api.Test
|
||||||
|
|
||||||
import scala.io.Source
|
import scala.io.Source
|
||||||
|
import scala.collection.JavaConverters._
|
||||||
class ExportDLITOOAFTest {
|
class ExportDLITOOAFTest {
|
||||||
|
|
||||||
val mapper = new ObjectMapper()
|
val mapper = new ObjectMapper()
|
||||||
|
@ -22,12 +23,27 @@ class ExportDLITOOAFTest {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extractDatasources(s:Scholix):List[String]= {
|
||||||
|
s.getTarget.getCollectedFrom.asScala.map(c => c.getProvider.getName)(collection.breakOut)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extractDatasources(s:ScholixSummary):List[String] = {
|
||||||
|
|
||||||
|
s.getDatasources.asScala.map(c => c.getDatasourceName)(collection.breakOut)
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
def testMappingRele():Unit = {
|
def testMappingRele():Unit = {
|
||||||
|
|
||||||
val r:Relation = new Relation
|
val r:Relation = new Relation
|
||||||
r.setSource("60|fbff1d424e045eecf24151a5fe3aa738")
|
r.setSource("60|fbff1d424e045eecf24151a5fe3aa738")
|
||||||
r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877")
|
r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877")
|
||||||
|
r.setRelType("IsReferencedBy")
|
||||||
|
|
||||||
|
|
||||||
val r1 =DLIToOAF.convertDLIRelation(r)
|
val r1 =DLIToOAF.convertDLIRelation(r)
|
||||||
println(r1.getSource, r1.getTarget)
|
println(r1.getSource, r1.getTarget)
|
||||||
|
|
|
@ -164,7 +164,7 @@ public class CreateRelatedEntitiesJob_phase1 {
|
||||||
|
|
||||||
if (result.getTitle() != null && !result.getTitle().isEmpty()) {
|
if (result.getTitle() != null && !result.getTitle().isEmpty()) {
|
||||||
final StructuredProperty title = result.getTitle().stream().findFirst().get();
|
final StructuredProperty title = result.getTitle().stream().findFirst().get();
|
||||||
title.setValue(StringUtils.left(title.getValue(), ProvisionConstants.MAX_TITLE_LENGTH));
|
title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||||
re.setTitle(title);
|
re.setTitle(title);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -178,7 +178,7 @@ public class CreateRelatedEntitiesJob_phase1 {
|
||||||
.getInstance()
|
.getInstance()
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.limit(ProvisionConstants.MAX_INSTANCES)
|
.limit(ModelHardLimits.MAX_INSTANCES)
|
||||||
.collect(Collectors.toList()));
|
.collect(Collectors.toList()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -240,15 +240,15 @@ public class CreateRelatedEntitiesJob_phase2 {
|
||||||
List<ExternalReference> refs = r
|
List<ExternalReference> refs = r
|
||||||
.getExternalReference()
|
.getExternalReference()
|
||||||
.stream()
|
.stream()
|
||||||
.limit(ProvisionConstants.MAX_EXTERNAL_ENTITIES)
|
.limit(ModelHardLimits.MAX_EXTERNAL_ENTITIES)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
r.setExternalReference(refs);
|
r.setExternalReference(refs);
|
||||||
}
|
}
|
||||||
if (r.getAuthor() != null) {
|
if (r.getAuthor() != null) {
|
||||||
List<Author> authors = Lists.newArrayList();
|
List<Author> authors = Lists.newArrayList();
|
||||||
for (Author a : r.getAuthor()) {
|
for (Author a : r.getAuthor()) {
|
||||||
a.setFullname(StringUtils.left(a.getFullname(), ProvisionConstants.MAX_AUTHOR_FULLNAME_LENGTH));
|
a.setFullname(StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH));
|
||||||
if (authors.size() < ProvisionConstants.MAX_AUTHORS || hasORCID(a)) {
|
if (authors.size() < ModelHardLimits.MAX_AUTHORS || hasORCID(a)) {
|
||||||
authors.add(a);
|
authors.add(a);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -260,7 +260,7 @@ public class CreateRelatedEntitiesJob_phase2 {
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.map(d -> {
|
.map(d -> {
|
||||||
d.setValue(StringUtils.left(d.getValue(), ProvisionConstants.MAX_ABSTRACT_LENGTH));
|
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
|
||||||
return d;
|
return d;
|
||||||
})
|
})
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
|
@ -272,10 +272,10 @@ public class CreateRelatedEntitiesJob_phase2 {
|
||||||
.stream()
|
.stream()
|
||||||
.filter(Objects::nonNull)
|
.filter(Objects::nonNull)
|
||||||
.map(t -> {
|
.map(t -> {
|
||||||
t.setValue(StringUtils.left(t.getValue(), ProvisionConstants.MAX_TITLE_LENGTH));
|
t.setValue(StringUtils.left(t.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
|
||||||
return t;
|
return t;
|
||||||
})
|
})
|
||||||
.limit(ProvisionConstants.MAX_TITLES)
|
.limit(ModelHardLimits.MAX_TITLES)
|
||||||
.collect(Collectors.toList());
|
.collect(Collectors.toList());
|
||||||
r.setTitle(titles);
|
r.setTitle(titles);
|
||||||
}
|
}
|
||||||
|
|
|
@ -44,6 +44,7 @@
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
</ARCS>
|
</ARCS>
|
||||||
</NODE>
|
</NODE>
|
||||||
|
|
||||||
<NODE isStart="true" name="setRawGraphPath" type="SetEnvParameter">
|
<NODE isStart="true" name="setRawGraphPath" type="SetEnvParameter">
|
||||||
<DESCRIPTION>Set the target path to store the RAW graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the RAW graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
|
@ -54,31 +55,45 @@
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
</ARCS>
|
</ARCS>
|
||||||
</NODE>
|
</NODE>
|
||||||
|
|
||||||
|
<NODE isStart="true" name="setFirstCleanedGraphPath" type="SetEnvParameter">
|
||||||
|
<DESCRIPTION>Set the target path to store the first CLEANED graph</DESCRIPTION>
|
||||||
|
<PARAMETERS>
|
||||||
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">firstCleanedGraphPath</PARAM>
|
||||||
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_provision/graph/02_graph_first_cleaned</PARAM>
|
||||||
|
</PARAMETERS>
|
||||||
|
<ARCS>
|
||||||
|
<ARC to="waitConfig"/>
|
||||||
|
</ARCS>
|
||||||
|
</NODE>
|
||||||
|
|
||||||
<NODE isStart="true" name="setDedupGraphPath" type="SetEnvParameter">
|
<NODE isStart="true" name="setDedupGraphPath" type="SetEnvParameter">
|
||||||
<DESCRIPTION>Set the target path to store the DEDUPED graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the DEDUPED graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">dedupGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">dedupGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/02_graph_dedup</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/03_graph_dedup</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
</ARCS>
|
</ARCS>
|
||||||
</NODE>
|
</NODE>
|
||||||
|
|
||||||
<NODE isStart="true" name="setInferredGraphPath" type="SetEnvParameter">
|
<NODE isStart="true" name="setInferredGraphPath" type="SetEnvParameter">
|
||||||
<DESCRIPTION>Set the target path to store the INFERRED graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the INFERRED graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">inferredGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">inferredGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/03_graph_inferred</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/04_graph_inferred</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
</ARCS>
|
</ARCS>
|
||||||
</NODE>
|
</NODE>
|
||||||
|
|
||||||
<NODE isStart="true" name="setConsistentGraphPath" type="SetEnvParameter">
|
<NODE isStart="true" name="setConsistentGraphPath" type="SetEnvParameter">
|
||||||
<DESCRIPTION>Set the target path to store the CONSISTENCY graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the CONSISTENCY graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">consistentGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">consistentGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/04_graph_consistent</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/05_graph_consistent</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -89,7 +104,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the ORCID enriched graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the ORCID enriched graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">orcidGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">orcidGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/05_graph_orcid</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/06_graph_orcid</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -100,7 +115,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the BULK TAGGED graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the BULK TAGGED graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">bulkTaggingGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">bulkTaggingGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/06_graph_bulktagging</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/07_graph_bulktagging</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -111,7 +126,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">affiliationGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">affiliationGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/07_graph_affiliation</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/08_graph_affiliation</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -122,7 +137,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the COMMUNITY from SELECTED SOURCES graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the COMMUNITY from SELECTED SOURCES graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">communityOrganizationGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">communityOrganizationGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/08_graph_comunity_organization</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/09_graph_comunity_organization</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -133,7 +148,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the FUNDING from SEMANTIC RELATION graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the FUNDING from SEMANTIC RELATION graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">fundingGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">fundingGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/09_graph_funding</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/10_graph_funding</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -144,7 +159,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the COMMUNITY from SEMANTIC RELATION graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the COMMUNITY from SEMANTIC RELATION graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">communitySemRelGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">communitySemRelGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/10_graph_comunity_sem_rel</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/11_graph_comunity_sem_rel</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -155,7 +170,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the COUNTRY enriched graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the COUNTRY enriched graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">countryGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">countryGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/11_graph_country</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/12_graph_country</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -166,7 +181,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the CLEANED graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the CLEANED graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">cleanedGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">cleanedGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/12_graph_cleaned</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/13_graph_cleaned</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -177,7 +192,7 @@
|
||||||
<DESCRIPTION>Set the target path to store the blacklisted graph</DESCRIPTION>
|
<DESCRIPTION>Set the target path to store the blacklisted graph</DESCRIPTION>
|
||||||
<PARAMETERS>
|
<PARAMETERS>
|
||||||
<PARAM managedBy="system" name="parameterName" required="true" type="string">blacklistedGraphPath</PARAM>
|
<PARAM managedBy="system" name="parameterName" required="true" type="string">blacklistedGraphPath</PARAM>
|
||||||
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/13_graph_blacklisted</PARAM>
|
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/14_graph_blacklisted</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="waitConfig"/>
|
<ARC to="waitConfig"/>
|
||||||
|
@ -324,6 +339,31 @@
|
||||||
</PARAM>
|
</PARAM>
|
||||||
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
|
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
|
||||||
</PARAMETERS>
|
</PARAMETERS>
|
||||||
|
<ARCS>
|
||||||
|
<ARC to="graphCleaningFirst"/>
|
||||||
|
</ARCS>
|
||||||
|
</NODE>
|
||||||
|
|
||||||
|
<NODE name="graphCleaningFirst" type="SubmitHadoopJob">
|
||||||
|
<DESCRIPTION>clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid</DESCRIPTION>
|
||||||
|
<PARAMETERS>
|
||||||
|
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
|
||||||
|
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
|
||||||
|
<PARAM managedBy="system" name="envParams" required="true" type="string">
|
||||||
|
{
|
||||||
|
'graphInputPath' : 'rawGraphPath',
|
||||||
|
'graphOutputPath': 'firstCleanedGraphPath',
|
||||||
|
'isLookupUrl': 'isLookUpUrl'
|
||||||
|
}
|
||||||
|
</PARAM>
|
||||||
|
<PARAM managedBy="system" name="params" required="true" type="string">
|
||||||
|
{
|
||||||
|
'oozie.wf.application.path' : '/lib/dnet/oa/graph/clean/oozie_app',
|
||||||
|
'workingPath' : '/tmp/beta_provision/working_dir/first_clean'
|
||||||
|
}
|
||||||
|
</PARAM>
|
||||||
|
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
|
||||||
|
</PARAMETERS>
|
||||||
<ARCS>
|
<ARCS>
|
||||||
<ARC to="duplicateScan"/>
|
<ARC to="duplicateScan"/>
|
||||||
</ARCS>
|
</ARCS>
|
||||||
|
@ -337,7 +377,7 @@
|
||||||
<PARAM managedBy="system" name="envParams" required="true" type="string">
|
<PARAM managedBy="system" name="envParams" required="true" type="string">
|
||||||
{
|
{
|
||||||
'actionSetId' : 'dedupConfig',
|
'actionSetId' : 'dedupConfig',
|
||||||
'graphBasePath' : 'rawGraphPath',
|
'graphBasePath' : 'firstCleanedGraphPath',
|
||||||
'dedupGraphPath': 'dedupGraphPath',
|
'dedupGraphPath': 'dedupGraphPath',
|
||||||
'isLookUpUrl' : 'isLookUpUrl'
|
'isLookUpUrl' : 'isLookUpUrl'
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue