forked from D-Net/dnet-hadoop
Merge branch 'stable_ids' into deduptesting
commit
3f2d3253e4
@ -1,14 +1,14 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.provision;
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
public class ProvisionConstants {
|
||||
public class ModelHardLimits {
|
||||
|
||||
public static final int MAX_EXTERNAL_ENTITIES = 50;
|
||||
public static final int MAX_AUTHORS = 200;
|
||||
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
|
||||
public static final int MAX_TITLE_LENGTH = 5000;
|
||||
public static final int MAX_TITLES = 10;
|
||||
public static final int MAX_ABSTRACT_LENGTH = 100000;
|
||||
public static final int MAX_ABSTRACT_LENGTH = 150000;
|
||||
public static final int MAX_INSTANCES = 10;
|
||||
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
|
||||
public class ResultTypeComparator implements Comparator<Result> {
|
||||
|
||||
@Override
|
||||
public int compare(Result left, Result right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
String lClass = left.getResulttype().getClassid();
|
||||
String rClass = right.getResulttype().getClassid();
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||
return -1;
|
||||
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
|
||||
return 1;
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
}
|
@ -0,0 +1,102 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
/**
|
||||
* Factory class for OpenAIRE identifiers in the Graph
|
||||
*/
|
||||
public class IdentifierFactory implements Serializable {
|
||||
|
||||
public static final String ID_SEPARATOR = "::";
|
||||
public static final String ID_PREFIX_SEPARATOR = "|";
|
||||
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
|
||||
+ "[a-zA-Z0-9]{32}$";
|
||||
|
||||
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
|
||||
"(^10\\.1002\\/[^\\s]+$)|" +
|
||||
"(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)|" +
|
||||
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
|
||||
|
||||
public static final int ID_PREFIX_LEN = 12;
|
||||
public static final String NONE = "none";
|
||||
|
||||
/**
|
||||
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
|
||||
* when no PID is available
|
||||
* @param entity the entity providing PIDs and a default ID.
|
||||
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
|
||||
* @return an identifier from the most relevant PID, entity.id otherwise
|
||||
*/
|
||||
public static <T extends OafEntity> String createIdentifier(T entity) {
|
||||
|
||||
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
|
||||
return entity.getId();
|
||||
}
|
||||
|
||||
return entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(s -> pidFilter(s))
|
||||
.min(new PidComparator<>(entity))
|
||||
.map(s -> idFromPid(entity, s))
|
||||
.map(IdentifierFactory::verifyIdSyntax)
|
||||
.orElseGet(entity::getId);
|
||||
}
|
||||
|
||||
protected static boolean pidFilter(StructuredProperty s) {
|
||||
if (Objects.isNull(s.getQualifier()) ||
|
||||
StringUtils.isBlank(StringUtils.trim(s.getValue()))) {
|
||||
return false;
|
||||
}
|
||||
try {
|
||||
switch (PidType.valueOf(s.getQualifier().getClassid())) {
|
||||
case doi:
|
||||
final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue()));
|
||||
return doi.matches(DOI_REGEX);
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
} catch (IllegalArgumentException e) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private static String verifyIdSyntax(String s) {
|
||||
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
|
||||
throw new RuntimeException(String.format("malformed id: '%s'", s));
|
||||
} else {
|
||||
return s;
|
||||
}
|
||||
}
|
||||
|
||||
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
|
||||
return new StringBuilder()
|
||||
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
|
||||
.append(ID_PREFIX_SEPARATOR)
|
||||
.append(createPrefix(s.getQualifier().getClassid()))
|
||||
.append(ID_SEPARATOR)
|
||||
.append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue()))
|
||||
.toString();
|
||||
}
|
||||
|
||||
// create the prefix (length = 12)
|
||||
private static String createPrefix(String pidType) {
|
||||
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
|
||||
while (prefix.length() < ID_PREFIX_LEN) {
|
||||
prefix.append("_");
|
||||
}
|
||||
return prefix.substring(0, ID_PREFIX_LEN);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,27 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class OrganizationPidComparator implements Comparator<PidType> {
|
||||
|
||||
@Override
|
||||
public int compare(PidType pLeft, PidType pRight) {
|
||||
if (pLeft.equals(PidType.GRID))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.GRID))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.mag_id))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.mag_id))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.urn))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.urn))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
@ -0,0 +1,54 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
|
||||
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
|
||||
|
||||
private T entity;
|
||||
|
||||
public PidComparator(T entity) {
|
||||
this.entity = entity;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compare(StructuredProperty left, StructuredProperty right) {
|
||||
|
||||
if (left == null && right == null)
|
||||
return 0;
|
||||
if (left == null)
|
||||
return 1;
|
||||
if (right == null)
|
||||
return -1;
|
||||
|
||||
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
|
||||
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
|
||||
|
||||
if (lClass.equals(rClass))
|
||||
return 0;
|
||||
|
||||
if (ModelSupport.isSubClass(entity, Result.class)) {
|
||||
return compareResultPids(lClass, rClass);
|
||||
}
|
||||
if (ModelSupport.isSubClass(entity, Organization.class)) {
|
||||
return compareOrganizationtPids(lClass, rClass);
|
||||
}
|
||||
|
||||
// Else (but unlikely), lexicographical ordering will do.
|
||||
return lClass.compareTo(rClass);
|
||||
}
|
||||
|
||||
private int compareResultPids(PidType lClass, PidType rClass) {
|
||||
return new ResultPidComparator().compare(lClass, rClass);
|
||||
}
|
||||
|
||||
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
|
||||
return new OrganizationPidComparator().compare(lClass, rClass);
|
||||
}
|
||||
}
|
@ -0,0 +1,29 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import org.apache.commons.lang3.EnumUtils;
|
||||
|
||||
public enum PidType {
|
||||
|
||||
// Result
|
||||
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
|
||||
|
||||
// Organization
|
||||
GRID, mag_id, urn,
|
||||
|
||||
// Used by dedup
|
||||
undefined, original;
|
||||
|
||||
public static boolean isValid(String type) {
|
||||
return EnumUtils.isValidEnum(PidType.class, type);
|
||||
}
|
||||
|
||||
public static PidType tryValueOf(String s) {
|
||||
try {
|
||||
return PidType.valueOf(s);
|
||||
} catch (Exception e) {
|
||||
return PidType.original;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,57 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import java.util.Comparator;
|
||||
|
||||
public class ResultPidComparator implements Comparator<PidType> {
|
||||
|
||||
@Override
|
||||
public int compare(PidType pLeft, PidType pRight) {
|
||||
if (pLeft.equals(PidType.doi))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.doi))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.pmid))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.pmid))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.pmc))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.pmc))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.handle))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.handle))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.arXiv))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.arXiv))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.NCID))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.NCID))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.GBIF))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.GBIF))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.nct))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.nct))
|
||||
return 1;
|
||||
|
||||
if (pLeft.equals(PidType.urn))
|
||||
return -1;
|
||||
if (pRight.equals(PidType.urn))
|
||||
return 1;
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
@ -0,0 +1,47 @@
|
||||
|
||||
package eu.dnetlib.dhp.schema.oaf.utils;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import com.fasterxml.jackson.databind.DeserializationFeature;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class IdentifierFactoryTest {
|
||||
|
||||
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
|
||||
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
|
||||
|
||||
@Test
|
||||
public void testCreateIdentifierForPublication() throws IOException {
|
||||
|
||||
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
|
||||
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
|
||||
verifyIdentifier(
|
||||
"publication_urn.json",
|
||||
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
|
||||
|
||||
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
|
||||
verifyIdentifier("publication_3.json", defaultID);
|
||||
verifyIdentifier("publication_4.json", defaultID);
|
||||
verifyIdentifier("publication_5.json", defaultID);
|
||||
}
|
||||
|
||||
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
|
||||
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
|
||||
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
|
||||
|
||||
String id = IdentifierFactory.createIdentifier(pub);
|
||||
|
||||
assertNotNull(id);
|
||||
assertEquals(expectedID, id);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1 @@
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}
|
@ -0,0 +1 @@
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]}
|
@ -0,0 +1 @@
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"}
|
@ -0,0 +1 @@
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
@ -0,0 +1 @@
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}
|
@ -0,0 +1 @@
|
||||
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmcid"},"value":"21459329"}]}
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,124 +1,46 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.dedup;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.*;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import static org.apache.commons.lang3.StringUtils.substringAfter;
|
||||
import static org.apache.commons.lang3.StringUtils.substringBefore;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
import java.io.Serializable;
|
||||
import java.util.List;
|
||||
|
||||
import eu.dnetlib.dhp.oa.dedup.model.Identifier;
|
||||
import eu.dnetlib.dhp.oa.dedup.model.PidType;
|
||||
import eu.dnetlib.dhp.schema.common.EntityType;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.OafEntity;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
|
||||
|
||||
public class IdGenerator implements Serializable {
|
||||
|
||||
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
|
||||
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
|
||||
public static String BASE_DATE = "2000-01-01";
|
||||
|
||||
// pick the best pid from the list (consider date and pidtype)
|
||||
public static String generate(List<Identifier> pids, String defaultID) {
|
||||
public static <T extends OafEntity> String generate(List<Identifier<T>> pids, String defaultID) {
|
||||
if (pids == null || pids.size() == 0)
|
||||
return defaultID;
|
||||
|
||||
Optional<Identifier> bp = pids
|
||||
Identifier<T> bp = pids
|
||||
.stream()
|
||||
.max(Identifier::compareTo);
|
||||
.min(Identifier::compareTo)
|
||||
.get();
|
||||
|
||||
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) {
|
||||
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::"
|
||||
+ DHPUtils.md5(bp.get().getOriginalID());
|
||||
} else {
|
||||
return bp.get().getOriginalID().split("\\|")[0] + "|"
|
||||
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
|
||||
+ DHPUtils.md5(bp.get().getPid().getValue());
|
||||
}
|
||||
|
||||
}
|
||||
String prefix = substringBefore(bp.getOriginalID(), "|");
|
||||
String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::");
|
||||
String suffix = substringAfter(bp.getOriginalID(), "::");
|
||||
|
||||
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) {
|
||||
|
||||
Date date;
|
||||
try {
|
||||
date = sdf.parse(BASE_DATE);
|
||||
} catch (ParseException e) {
|
||||
date = new Date();
|
||||
final String pidType = substringBefore(ns, "_");
|
||||
if (PidType.isValid(pidType)) {
|
||||
return prefix + "|" + dedupify(ns) + "::" + suffix;
|
||||
} else {
|
||||
return prefix + "|dedup_wf_001::" + suffix;
|
||||
}
|
||||
return Lists
|
||||
.newArrayList(
|
||||
new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
|
||||
EntityType.fromClass(entity.getClass()), entity.getId()));
|
||||
}
|
||||
|
||||
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
|
||||
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
|
||||
|
||||
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
|
||||
|
||||
if (entity.getPid() == null || entity.getPid().size() == 0)
|
||||
return createBasePid(entity, sdf);
|
||||
|
||||
Optional<StructuredProperty> bp = entity
|
||||
.getPid()
|
||||
.stream()
|
||||
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
|
||||
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
|
||||
|
||||
return bp
|
||||
.map(
|
||||
structuredProperty -> Lists
|
||||
.newArrayList(
|
||||
new Identifier(structuredProperty, extractDate(entity, sdf),
|
||||
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
|
||||
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
|
||||
.orElseGet(() -> createBasePid(entity, sdf));
|
||||
|
||||
}
|
||||
|
||||
// create the prefix (length = 12): dedup_+ pidType
|
||||
public static String createPrefix(String pidType) {
|
||||
|
||||
StringBuilder prefix = new StringBuilder("dedup_" + pidType);
|
||||
|
||||
private static String dedupify(String ns) {
|
||||
StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
|
||||
while (prefix.length() < 12) {
|
||||
prefix.append("_");
|
||||
}
|
||||
return prefix.toString().substring(0, 12);
|
||||
|
||||
}
|
||||
|
||||
// extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
|
||||
// 00-01-01
|
||||
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
|
||||
|
||||
String date = BASE_DATE;
|
||||
if (ModelSupport.isSubClass(duplicate, Result.class)) {
|
||||
Result result = (Result) duplicate;
|
||||
if (isWellformed(result.getDateofacceptance())) {
|
||||
date = result.getDateofacceptance().getValue();
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
return sdf.parse(date);
|
||||
} catch (ParseException e) {
|
||||
return new Date();
|
||||
}
|
||||
|
||||
return prefix.substring(0, 12);
|
||||
}
|
||||
|
||||
public static boolean isWellformed(Field<String> date) {
|
||||
return date != null && StringUtils.isNotBlank(date.getValue())
|
||||
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
|
||||
}
|
||||
}
|
||||
|
@ -1,17 +0,0 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.dedup.model;
|
||||
|
||||
public enum PidType {

    // from the less to the more important
    undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;

    /**
     * Lenient variant of {@code valueOf}.
     *
     * @param s the constant name to look up (exact, case-sensitive match), may be null
     * @return the constant named s, or {@link #undefined} when there is no such constant
     */
    public static PidType classidValueOf(String s) {
        for (PidType candidate : values()) {
            if (candidate.name().equals(s)) {
                return candidate;
            }
        }
        return undefined;
    }

}
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,3 @@
|
||||
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||
{ "id" : "50|doi_________::1a77a3bba737f8b669dcf330ad3b37e2", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "doi" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
@ -0,0 +1,3 @@
|
||||
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
||||
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1i", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,4 +1,5 @@
|
||||
[
|
||||
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true}
|
||||
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true},
|
||||
{"paramName":"e", "paramLongName":"entity", "paramDescription": "the entity to be processed", "paramRequired": true}
|
||||
]
|
@ -0,0 +1,99 @@
|
||||
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.Mockito.lenient;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.schema.common.ModelConstants;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class GenerateEntitiesApplicationTest {
|
||||
|
||||
@Mock
|
||||
private ISLookUpService isLookUpService;
|
||||
|
||||
@Mock
|
||||
private VocabularyGroup vocs;
|
||||
|
||||
@BeforeEach
|
||||
public void setUp() throws IOException, ISLookUpException {
|
||||
|
||||
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
|
||||
lenient()
|
||||
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
|
||||
.thenReturn(synonyms());
|
||||
|
||||
vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testMergeResult() throws IOException {
|
||||
Result publication = getResult("oaf_record.xml", Publication.class);
|
||||
Result dataset = getResult("odf_dataset.xml", Dataset.class);
|
||||
Result software = getResult("odf_software.xml", Software.class);
|
||||
Result orp = getResult("oaf_orp.xml", OtherResearchProduct.class);
|
||||
|
||||
verifyMerge(publication, dataset, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||
verifyMerge(dataset, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||
|
||||
verifyMerge(publication, software, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||
verifyMerge(software, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||
|
||||
verifyMerge(publication, orp, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||
verifyMerge(orp, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
|
||||
|
||||
verifyMerge(dataset, software, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
|
||||
verifyMerge(software, dataset, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
|
||||
|
||||
verifyMerge(dataset, orp, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
|
||||
verifyMerge(orp, dataset, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
|
||||
|
||||
verifyMerge(software, orp, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
|
||||
verifyMerge(orp, software, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
|
||||
}
|
||||
|
||||
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
|
||||
String resultType) {
|
||||
final Result merge = GenerateEntitiesApplication.mergeResults(publication, dataset);
|
||||
assertTrue(clazz.isAssignableFrom(merge.getClass()));
|
||||
assertEquals(resultType, merge.getResulttype().getClassid());
|
||||
}
|
||||
|
||||
protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz) throws IOException {
|
||||
final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
|
||||
return new OdfToOafMapper(vocs, false)
|
||||
.processMdRecord(xml)
|
||||
.stream()
|
||||
.filter(s -> clazz.isAssignableFrom(s.getClass()))
|
||||
.map(s -> (Result) s)
|
||||
.findFirst()
|
||||
.get();
|
||||
}
|
||||
|
||||
private List<String> vocs() throws IOException {
|
||||
return IOUtils
|
||||
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
|
||||
}
|
||||
|
||||
private List<String> synonyms() throws IOException {
|
||||
return IOUtils
|
||||
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,83 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
|
||||
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
|
||||
xmlns:oaf="http://namespace.openaire.eu/oaf"
|
||||
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
|
||||
<header xmlns="http://namespace.openaire.eu/">
|
||||
<dri:objIdentifier>pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2</dri:objIdentifier>
|
||||
<dri:recordIdentifier>10.3897/oneeco.2.e13718</dri:recordIdentifier>
|
||||
<dri:dateOfCollection/>
|
||||
<dri:mdFormat/>
|
||||
<dri:mdFormatInterpretation/>
|
||||
<dri:repositoryId/>
|
||||
<dr:objectIdentifier/>
|
||||
<dr:dateOfCollection>2020-03-23T00:20:51.392Z</dr:dateOfCollection>
|
||||
<dr:dateOfTransformation>2020-03-23T00:26:59.078Z</dr:dateOfTransformation>
|
||||
<oaf:datasourceprefix>pensoft_____</oaf:datasourceprefix>
|
||||
</header>
|
||||
<metadata xmlns="http://namespace.openaire.eu/">
|
||||
<dc:title>Ecosystem Service capacity is higher in areas of multiple designation types</dc:title>
|
||||
<dc:creator>Nikolaidou,Charitini</dc:creator>
|
||||
<dc:creator nameIdentifier="0000-0001-6651-1178" nameIdentifierScheme="ORCID">Votsi,Nefta</dc:creator>
|
||||
<dc:creator>Sgardelis,Steanos</dc:creator>
|
||||
<dc:creator>Halley,John</dc:creator>
|
||||
<dc:creator>Pantis,John</dc:creator>
|
||||
<dc:creator>Tsiafouli,Maria</dc:creator>
|
||||
<dc:date>2017</dc:date>
|
||||
<dc:description>The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. 
We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management.</dc:description>
|
||||
<dc:format>text/html</dc:format>
|
||||
<dc:identifier>https://doi.org/10.3897/oneeco.2.e13718</dc:identifier>
|
||||
<dc:identifier>https://oneecosystem.pensoft.net/article/13718/</dc:identifier>
|
||||
<dc:language>eng</dc:language>
|
||||
<dc:publisher>Pensoft Publishers</dc:publisher>
|
||||
<dc:relation>info:eu-repo/semantics/altIdentifier/eissn/2367-8194</dc:relation>
|
||||
<dc:relation>info:eu-repo/grantAgreement/EC/FP7/226852</dc:relation>
|
||||
<dc:source>One Ecosystem 2: e13718</dc:source>
|
||||
<dc:source>One Ecosystem 2: e13718</dc:source>
|
||||
<dc:source>One Ecosystem 2: e13718</dc:source>
|
||||
<dc:subject>Ecosystem Services hotspots</dc:subject>
|
||||
<dc:subject>Natura 2000</dc:subject>
|
||||
<dc:subject>Quiet Protected Areas</dc:subject>
|
||||
<dc:subject>Biodiversity</dc:subject>
|
||||
<dc:subject>Agriculture</dc:subject>
|
||||
<dc:subject>Elevation</dc:subject>
|
||||
<dc:subject>Slope</dc:subject>
|
||||
<dc:subject>Ecosystem Service trade-offs and synergies</dc:subject>
|
||||
<dc:subject> cultural services</dc:subject>
|
||||
<dc:subject>provisioning services</dc:subject>
|
||||
<dc:subject>regulating services</dc:subject>
|
||||
<dc:subject>supporting services</dc:subject>
|
||||
<dc:type>Research Artefact</dc:type>
|
||||
<dr:CobjCategory type="other">0020</dr:CobjCategory>
|
||||
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
|
||||
<oaf:projectid>corda_______::226852</oaf:projectid>
|
||||
<oaf:accessrights>OPEN</oaf:accessrights>
|
||||
<oaf:hostedBy id="openaire____::issn226852" name="One Ecosystem"/>
|
||||
<oaf:collectedFrom
|
||||
id="openaire____::45e3c7b69bcee6cc5fa945c9e183deb9" name="Pensoft"/>
|
||||
<oaf:identifier identifierType="doi">10.3897/oneeco.2.e13718</oaf:identifier>
|
||||
<oaf:fulltext>https://oneecosystem.pensoft.net/article/13718/</oaf:fulltext>
|
||||
<oaf:journal eissn="2367-8194" issn="">One Ecosystem</oaf:journal>
|
||||
<oaf:refereed>0001</oaf:refereed>
|
||||
</metadata>
|
||||
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
|
||||
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
|
||||
<originDescription altered="true" harvestDate="2020-03-23T00:20:51.392Z">
|
||||
<baseURL>http%3A%2F%2Fzookeys.pensoft.net%2Foai.php</baseURL>
|
||||
<identifier>10.3897/oneeco.2.e13718</identifier>
|
||||
<datestamp>2017-09-08</datestamp>
|
||||
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
|
||||
</originDescription>
|
||||
</provenance>
|
||||
<oaf:datainfo>
|
||||
<oaf:inferred>false</oaf:inferred>
|
||||
<oaf:deletedbyinference>false</oaf:deletedbyinference>
|
||||
<oaf:trust>0.9</oaf:trust>
|
||||
<oaf:inferenceprovenance/>
|
||||
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
|
||||
classname="sysimport:crosswalk:repository"
|
||||
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
|
||||
</oaf:datainfo>
|
||||
</about>
|
||||
</record>
|
@ -0,0 +1,111 @@
|
||||
|
||||
package eu.dnetlib.dhp.export.zenodo;
|
||||
|
||||
import java.io.*;
|
||||
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
|
||||
public class MakeTar implements Serializable {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MakeTar.class);
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
String jsonConfiguration = IOUtils
|
||||
.toString(
|
||||
MakeTar.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/export/input_maketar_parameters.json"));
|
||||
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String outputPath = parser.get("targetPath");
|
||||
log.info("hdfsPath: {}", outputPath);
|
||||
|
||||
final String hdfsNameNode = parser.get("nameNode");
|
||||
log.info("nameNode: {}", hdfsNameNode);
|
||||
|
||||
final String inputPath = parser.get("sourcePath");
|
||||
log.info("input path : {}", inputPath);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
makeTArArchive(fileSystem, inputPath, outputPath);
|
||||
|
||||
}
|
||||
|
||||
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException {
|
||||
|
||||
RemoteIterator<LocatedFileStatus> dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath));
|
||||
|
||||
while (dir_iterator.hasNext()) {
|
||||
LocatedFileStatus fileStatus = dir_iterator.next();
|
||||
|
||||
Path p = fileStatus.getPath();
|
||||
String p_string = p.toString();
|
||||
String entity = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
|
||||
write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
|
||||
throws IOException {
|
||||
|
||||
Path hdfsWritePath = new Path(outputPath);
|
||||
FSDataOutputStream fsDataOutputStream = null;
|
||||
if (fileSystem.exists(hdfsWritePath)) {
|
||||
fileSystem.delete(hdfsWritePath, true);
|
||||
|
||||
}
|
||||
fsDataOutputStream = fileSystem.create(hdfsWritePath);
|
||||
|
||||
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
|
||||
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
.listFiles(
|
||||
new Path(inputPath), true);
|
||||
|
||||
while (fileStatusListIterator.hasNext()) {
|
||||
LocatedFileStatus fileStatus = fileStatusListIterator.next();
|
||||
|
||||
Path p = fileStatus.getPath();
|
||||
String p_string = p.toString();
|
||||
if (!p_string.endsWith("_SUCCESS")) {
|
||||
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz");
|
||||
entry.setSize(fileStatus.getLen());
|
||||
ar.putArchiveEntry(entry);
|
||||
|
||||
InputStream is = fileSystem.open(fileStatus.getPath());
|
||||
|
||||
BufferedInputStream bis = new BufferedInputStream(is);
|
||||
|
||||
int count;
|
||||
byte data[] = new byte[1024];
|
||||
while ((count = bis.read(data, 0, data.length)) != -1) {
|
||||
ar.write(data, 0, count);
|
||||
}
|
||||
bis.close();
|
||||
ar.closeArchiveEntry();
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ar.close();
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,80 @@
|
||||
|
||||
package eu.dnetlib.dhp.export.zenodo;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Optional;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.*;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
|
||||
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
|
||||
|
||||
public class SendToZenodoHDFS implements Serializable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception, MissingConceptDoiException {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
SendToZenodoHDFS.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/export/upload_zenodo.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String hdfsPath = parser.get("hdfsPath");
|
||||
final String hdfsNameNode = parser.get("nameNode");
|
||||
final String access_token = parser.get("accessToken");
|
||||
final String connection_url = parser.get("connectionUrl");
|
||||
final String metadata = parser.get("metadata");
|
||||
final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition"));
|
||||
final String concept_rec_id = Optional
|
||||
.ofNullable(parser.get("conceptRecordId"))
|
||||
.orElse(null);
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
conf.set("fs.defaultFS", hdfsNameNode);
|
||||
|
||||
FileSystem fileSystem = FileSystem.get(conf);
|
||||
|
||||
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
|
||||
.listFiles(
|
||||
new Path(hdfsPath), true);
|
||||
ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
|
||||
if (newDeposition) {
|
||||
zenodoApiClient.newDeposition();
|
||||
} else {
|
||||
if (concept_rec_id == null) {
|
||||
throw new MissingConceptDoiException("No concept record id has been provided");
|
||||
}
|
||||
zenodoApiClient.newVersion(concept_rec_id);
|
||||
}
|
||||
|
||||
while (fileStatusListIterator.hasNext()) {
|
||||
LocatedFileStatus fileStatus = fileStatusListIterator.next();
|
||||
|
||||
Path p = fileStatus.getPath();
|
||||
String p_string = p.toString();
|
||||
if (!p_string.endsWith("_SUCCESS")) {
|
||||
// String tmp = p_string.substring(0, p_string.lastIndexOf("/"));
|
||||
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
|
||||
log.info("Sending information for community: " + name);
|
||||
FSDataInputStream inputStream = fileSystem.open(p);
|
||||
zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen());
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
zenodoApiClient.sendMretadata(metadata);
|
||||
zenodoApiClient.publish();
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,20 @@
|
||||
[
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "nameNode",
|
||||
"paramDescription": "the Name Node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
||||
"paramDescription": "the target path",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
@ -0,0 +1,45 @@
|
||||
|
||||
[
|
||||
{
|
||||
"paramName":"nd",
|
||||
"paramLongName":"newDeposition",
|
||||
"paramDescription": "if it is a new deposition (true) or a new version (false)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName":"cri",
|
||||
"paramLongName":"conceptRecordId",
|
||||
"paramDescription": "The id of the concept record for a new version",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName":"hdfsp",
|
||||
"paramLongName":"hdfsPath",
|
||||
"paramDescription": "the path of the folder to find files to send to Zenodo",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "nn",
|
||||
"paramLongName": "nameNode",
|
||||
"paramDescription": "the name node",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "at",
|
||||
"paramLongName": "accessToken",
|
||||
"paramDescription": "the access token for the deposition",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName":"cu",
|
||||
"paramLongName":"connectionUrl",
|
||||
"paramDescription": "the url to connect to deposit",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName":"m",
|
||||
"paramLongName":"metadata",
|
||||
"paramDescription": "metadata associated to the deposition",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
@ -0,0 +1,48 @@
|
||||
<configuration>
|
||||
<property>
|
||||
<name>jobTracker</name>
|
||||
<value>yarnRM</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>nameNode</name>
|
||||
<value>hdfs://nameservice1</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.use.system.libpath</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>spark2</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.wf.rerun.failnodes</name>
|
||||
<value>false</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>hive_metastore_uris</name>
|
||||
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<value>/user/spark/spark2ApplicationHistory</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>oozie.launcher.mapreduce.user.classpath.first</name>
|
||||
<value>true</value>
|
||||
</property>
|
||||
|
||||
</configuration>
|
@ -0,0 +1,53 @@
|
||||
<workflow-app name="Send Dump to Zenodo" xmlns="uri:oozie:workflow:0.5">
|
||||
<parameters>
|
||||
<property>
|
||||
<name>sourcePath</name>
|
||||
<description>the source path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>targetPath</name>
|
||||
<description>the target path</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>metadata</name>
|
||||
<description>the metadata</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<start to="send_zenodo"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<action name="MakeTar">
|
||||
<java>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<main-class>eu.dnetlib.dhp.export.zenodo.MakeTar</main-class>
|
||||
<arg>-t</arg><arg>${targetPath}</arg>
|
||||
<arg>-n</arg><arg>${nameNode}</arg>
|
||||
<arg>-s</arg><arg>${sourcePath}</arg>
|
||||
</java>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
|
||||
<action name="send_zenodo">
    <java>
        <!-- same cluster coordinates as the MakeTar action; this workflow declares no <global> section -->
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <main-class>eu.dnetlib.dhp.export.zenodo.SendToZenodoHDFS</main-class>
        <arg>--hdfsPath</arg><arg>/user/dnet.scholexplorer/scholix/provision/scholix.tar/scholix-2020-10-16.tar</arg>
        <arg>--nameNode</arg><arg>${nameNode}</arg>
        <!-- SECURITY: the Zenodo token must be supplied as the accessToken job property, never committed here.
             The token previously hard-coded in this file has to be considered leaked and should be revoked. -->
        <arg>--accessToken</arg><arg>${accessToken}</arg>
        <arg>--connectionUrl</arg><arg>https://zenodo.org/api/deposit/depositions</arg>
        <arg>--metadata</arg><arg>${metadata}</arg>
        <arg>--conceptRecordId</arg><arg>1200252</arg>
        <arg>--newDeposition</arg><arg>false</arg>
    </java>
    <ok to="End"/>
    <error to="Kill"/>
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
Loading…
Reference in New Issue