1
0
Fork 0

Merge branch 'stable_ids' into deduptesting

This commit is contained in:
miconis 2020-11-05 15:52:57 +01:00
commit 3f2d3253e4
73 changed files with 1713 additions and 692 deletions

View File

@ -29,6 +29,12 @@
<artifactId>spark-sql_2.11</artifactId> <artifactId>spark-sql_2.11</artifactId>
</dependency> </dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency> <dependency>
<groupId>commons-cli</groupId> <groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId> <artifactId>commons-cli</artifactId>

View File

@ -94,7 +94,13 @@ public class AuthorMerger {
if (r.getPid() == null) { if (r.getPid() == null) {
r.setPid(new ArrayList<>()); r.setPid(new ArrayList<>());
} }
r.getPid().add(a._1());
// TERRIBLE HACK but for some reason when we create and Array with Arrays.asList,
// it creates of fixed size, and the add method raise UnsupportedOperationException at
// java.util.AbstractList.add
final List<StructuredProperty> tmp = new ArrayList<>(r.getPid());
tmp.add(a._1());
r.setPid(tmp);
} }
} }
}); });

View File

@ -1,8 +1,9 @@
package eu.dnetlib.dhp.oa.graph.clean; package eu.dnetlib.dhp.schema.oaf;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.Objects; import java.util.Objects;
import java.util.Optional;
import java.util.function.Function; import java.util.function.Function;
import java.util.stream.Collectors; import java.util.stream.Collectors;
@ -10,14 +11,13 @@ import org.apache.commons.lang3.StringUtils;
import com.clearspring.analytics.util.Lists; import com.clearspring.analytics.util.Lists;
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
public class CleaningFunctions { public class CleaningFunctions {
public static final String DOI_URL_PREFIX_REGEX = "(^http(s?):\\/\\/)(((dx\\.)?doi\\.org)|(handle\\.test\\.datacite\\.org))\\/";
public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/"; public static final String ORCID_PREFIX_REGEX = "^http(s?):\\/\\/orcid\\.org\\/";
public static final String NONE = "none";
public static <T extends Oaf> T fixVocabularyNames(T value) { public static <T extends Oaf> T fixVocabularyNames(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
@ -71,7 +71,7 @@ public class CleaningFunctions {
return value; return value;
} }
protected static <T extends Oaf> T fixDefaults(T value) { public static <T extends Oaf> T fixDefaults(T value) {
if (value instanceof Datasource) { if (value instanceof Datasource) {
// nothing to clean here // nothing to clean here
} else if (value instanceof Project) { } else if (value instanceof Project) {
@ -106,6 +106,20 @@ public class CleaningFunctions {
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid())) .filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }
if (Objects.nonNull(r.getPid())) {
r
.setPid(
r
.getPid()
.stream()
.filter(Objects::nonNull)
.filter(sp -> StringUtils.isNotBlank(StringUtils.trim(sp.getValue())))
.filter(sp -> NONE.equalsIgnoreCase(sp.getValue()))
.filter(sp -> Objects.nonNull(sp.getQualifier()))
.filter(sp -> StringUtils.isNotBlank(sp.getQualifier().getClassid()))
.map(CleaningFunctions::normalizePidValue)
.collect(Collectors.toList()));
}
if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) { if (Objects.isNull(r.getResourcetype()) || StringUtils.isBlank(r.getResourcetype().getClassid())) {
r r
.setResourcetype( .setResourcetype(
@ -125,7 +139,7 @@ public class CleaningFunctions {
} }
} }
if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) { if (Objects.isNull(r.getBestaccessright()) || StringUtils.isBlank(r.getBestaccessright().getClassid())) {
Qualifier bestaccessrights = AbstractMdRecordToOafMapper.createBestAccessRights(r.getInstance()); Qualifier bestaccessrights = OafMapperUtils.createBestAccessRights(r.getInstance());
if (Objects.isNull(bestaccessrights)) { if (Objects.isNull(bestaccessrights)) {
r r
.setBestaccessright( .setBestaccessright(
@ -201,4 +215,24 @@ public class CleaningFunctions {
classid, classname, scheme, scheme); classid, classname, scheme, scheme);
} }
/**
* Utility method that normalises PID values on a per-type basis.
* @param pid the PID whose value will be normalised.
* @return the PID containing the normalised value.
*/
public static StructuredProperty normalizePidValue(StructuredProperty pid) {
String value = Optional
.ofNullable(pid.getValue())
.map(String::trim)
.orElseThrow(() -> new IllegalArgumentException("PID value cannot be empty"));
switch (pid.getQualifier().getClassid()) {
// TODO add cleaning for more PID types as needed
case "doi":
pid.setValue(value.toLowerCase().replaceAll(DOI_URL_PREFIX_REGEX, ""));
break;
}
return pid;
}
} }

View File

@ -1,14 +1,14 @@
package eu.dnetlib.dhp.oa.provision; package eu.dnetlib.dhp.schema.oaf;
public class ProvisionConstants { public class ModelHardLimits {
public static final int MAX_EXTERNAL_ENTITIES = 50; public static final int MAX_EXTERNAL_ENTITIES = 50;
public static final int MAX_AUTHORS = 200; public static final int MAX_AUTHORS = 200;
public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000; public static final int MAX_AUTHOR_FULLNAME_LENGTH = 1000;
public static final int MAX_TITLE_LENGTH = 5000; public static final int MAX_TITLE_LENGTH = 5000;
public static final int MAX_TITLES = 10; public static final int MAX_TITLES = 10;
public static final int MAX_ABSTRACT_LENGTH = 100000; public static final int MAX_ABSTRACT_LENGTH = 150000;
public static final int MAX_INSTANCES = 10; public static final int MAX_INSTANCES = 10;
} }

View File

@ -1,11 +1,10 @@
package eu.dnetlib.dhp.oa.graph.raw.common; package eu.dnetlib.dhp.schema.oaf;
import java.util.ArrayList; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import java.util.Arrays; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_ACCESS_MODES;
import java.util.List;
import java.util.Map; import java.util.*;
import java.util.Objects;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import java.util.function.Function; import java.util.function.Function;
import java.util.function.Predicate; import java.util.function.Predicate;
@ -13,15 +12,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils; import eu.dnetlib.dhp.utils.DHPUtils;
public class OafMapperUtils { public class OafMapperUtils {
@ -270,4 +261,36 @@ public class OafMapperUtils {
final Map<Object, Boolean> seen = new ConcurrentHashMap<>(); final Map<Object, Boolean> seen = new ConcurrentHashMap<>();
return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null; return t -> seen.putIfAbsent(keyExtractor.apply(t), Boolean.TRUE) == null;
} }
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<Qualifier> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? min.get() : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
} }

View File

@ -0,0 +1,49 @@
package eu.dnetlib.dhp.schema.oaf;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelConstants;
public class ResultTypeComparator implements Comparator<Result> {
@Override
public int compare(Result left, Result right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
String lClass = left.getResulttype().getClassid();
String rClass = right.getResulttype().getClassid();
if (lClass.equals(rClass))
return 0;
if (lClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.PUBLICATION_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.DATASET_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.SOFTWARE_RESULTTYPE_CLASSID))
return 1;
if (lClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return -1;
if (rClass.equals(ModelConstants.ORP_RESULTTYPE_CLASSID))
return 1;
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
}

View File

@ -0,0 +1,102 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.io.Serializable;
import java.util.Objects;
import java.util.Optional;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringUtils;
import eu.dnetlib.dhp.schema.oaf.CleaningFunctions;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
/**
* Factory class for OpenAIRE identifiers in the Graph
*/
public class IdentifierFactory implements Serializable {
public static final String ID_SEPARATOR = "::";
public static final String ID_PREFIX_SEPARATOR = "|";
public final static String ID_REGEX = "^[0-9][0-9]\\" + ID_PREFIX_SEPARATOR + ".{12}" + ID_SEPARATOR
+ "[a-zA-Z0-9]{32}$";
public final static String DOI_REGEX = "(^10\\.[0-9]{4,9}\\/[-._;()\\/:a-zA-Z0-9]+$)|" +
"(^10\\.1002\\/[^\\s]+$)|" +
"(^10\\.1021\\/[a-zA-Z0-9_][a-zA-Z0-9_][0-9]++$)|" +
"(^10\\.1207\\/[a-zA-Z0-9_]+\\&[0-9]+_[0-9]+$)";
public static final int ID_PREFIX_LEN = 12;
public static final String NONE = "none";
/**
* Creates an identifier from the most relevant PID (if available) in the given entity T. Returns entity.id
* when no PID is available
* @param entity the entity providing PIDs and a default ID.
* @param <T> the specific entity type. Currently Organization and Result subclasses are supported.
* @return an identifier from the most relevant PID, entity.id otherwise
*/
public static <T extends OafEntity> String createIdentifier(T entity) {
if (Objects.isNull(entity.getPid()) || entity.getPid().isEmpty()) {
return entity.getId();
}
return entity
.getPid()
.stream()
.filter(s -> pidFilter(s))
.min(new PidComparator<>(entity))
.map(s -> idFromPid(entity, s))
.map(IdentifierFactory::verifyIdSyntax)
.orElseGet(entity::getId);
}
protected static boolean pidFilter(StructuredProperty s) {
if (Objects.isNull(s.getQualifier()) ||
StringUtils.isBlank(StringUtils.trim(s.getValue()))) {
return false;
}
try {
switch (PidType.valueOf(s.getQualifier().getClassid())) {
case doi:
final String doi = StringUtils.trim(StringUtils.lowerCase(s.getValue()));
return doi.matches(DOI_REGEX);
default:
return true;
}
} catch (IllegalArgumentException e) {
return false;
}
}
private static String verifyIdSyntax(String s) {
if (StringUtils.isBlank(s) || !s.matches(ID_REGEX)) {
throw new RuntimeException(String.format("malformed id: '%s'", s));
} else {
return s;
}
}
private static <T extends OafEntity> String idFromPid(T entity, StructuredProperty s) {
return new StringBuilder()
.append(StringUtils.substringBefore(entity.getId(), ID_PREFIX_SEPARATOR))
.append(ID_PREFIX_SEPARATOR)
.append(createPrefix(s.getQualifier().getClassid()))
.append(ID_SEPARATOR)
.append(DHPUtils.md5(CleaningFunctions.normalizePidValue(s).getValue()))
.toString();
}
// create the prefix (length = 12)
private static String createPrefix(String pidType) {
StringBuilder prefix = new StringBuilder(StringUtils.left(pidType, ID_PREFIX_LEN));
while (prefix.length() < ID_PREFIX_LEN) {
prefix.append("_");
}
return prefix.substring(0, ID_PREFIX_LEN);
}
}

View File

@ -0,0 +1,27 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
public class OrganizationPidComparator implements Comparator<PidType> {
@Override
public int compare(PidType pLeft, PidType pRight) {
if (pLeft.equals(PidType.GRID))
return -1;
if (pRight.equals(PidType.GRID))
return 1;
if (pLeft.equals(PidType.mag_id))
return -1;
if (pRight.equals(PidType.mag_id))
return 1;
if (pLeft.equals(PidType.urn))
return -1;
if (pRight.equals(PidType.urn))
return 1;
return 0;
}
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class PidComparator<T extends OafEntity> implements Comparator<StructuredProperty> {
private T entity;
public PidComparator(T entity) {
this.entity = entity;
}
@Override
public int compare(StructuredProperty left, StructuredProperty right) {
if (left == null && right == null)
return 0;
if (left == null)
return 1;
if (right == null)
return -1;
PidType lClass = PidType.valueOf(left.getQualifier().getClassid());
PidType rClass = PidType.valueOf(right.getQualifier().getClassid());
if (lClass.equals(rClass))
return 0;
if (ModelSupport.isSubClass(entity, Result.class)) {
return compareResultPids(lClass, rClass);
}
if (ModelSupport.isSubClass(entity, Organization.class)) {
return compareOrganizationtPids(lClass, rClass);
}
// Else (but unlikely), lexicographical ordering will do.
return lClass.compareTo(rClass);
}
private int compareResultPids(PidType lClass, PidType rClass) {
return new ResultPidComparator().compare(lClass, rClass);
}
private int compareOrganizationtPids(PidType lClass, PidType rClass) {
return new OrganizationPidComparator().compare(lClass, rClass);
}
}

View File

@ -0,0 +1,29 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import org.apache.commons.lang3.EnumUtils;
public enum PidType {
// Result
doi, pmid, pmc, handle, arXiv, NCID, GBIF, nct, pdb,
// Organization
GRID, mag_id, urn,
// Used by dedup
undefined, original;
public static boolean isValid(String type) {
return EnumUtils.isValidEnum(PidType.class, type);
}
public static PidType tryValueOf(String s) {
try {
return PidType.valueOf(s);
} catch (Exception e) {
return PidType.original;
}
}
}

View File

@ -0,0 +1,57 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import java.util.Comparator;
public class ResultPidComparator implements Comparator<PidType> {
@Override
public int compare(PidType pLeft, PidType pRight) {
if (pLeft.equals(PidType.doi))
return -1;
if (pRight.equals(PidType.doi))
return 1;
if (pLeft.equals(PidType.pmid))
return -1;
if (pRight.equals(PidType.pmid))
return 1;
if (pLeft.equals(PidType.pmc))
return -1;
if (pRight.equals(PidType.pmc))
return 1;
if (pLeft.equals(PidType.handle))
return -1;
if (pRight.equals(PidType.handle))
return 1;
if (pLeft.equals(PidType.arXiv))
return -1;
if (pRight.equals(PidType.arXiv))
return 1;
if (pLeft.equals(PidType.NCID))
return -1;
if (pRight.equals(PidType.NCID))
return 1;
if (pLeft.equals(PidType.GBIF))
return -1;
if (pRight.equals(PidType.GBIF))
return 1;
if (pLeft.equals(PidType.nct))
return -1;
if (pRight.equals(PidType.nct))
return 1;
if (pLeft.equals(PidType.urn))
return -1;
if (pRight.equals(PidType.urn))
return 1;
return 0;
}
}

View File

@ -0,0 +1,47 @@
package eu.dnetlib.dhp.schema.oaf.utils;
import static org.junit.jupiter.api.Assertions.*;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.Test;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.utils.DHPUtils;
public class IdentifierFactoryTest {
private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
@Test
public void testCreateIdentifierForPublication() throws IOException {
verifyIdentifier("publication_doi.json", "50|doi_________::" + DHPUtils.md5("10.1016/j.cmet.2011.03.013"));
verifyIdentifier("publication_pmc.json", "50|pmc_________::" + DHPUtils.md5("21459329"));
verifyIdentifier(
"publication_urn.json",
"50|urn_________::" + DHPUtils.md5("urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"));
final String defaultID = "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f";
verifyIdentifier("publication_3.json", defaultID);
verifyIdentifier("publication_4.json", defaultID);
verifyIdentifier("publication_5.json", defaultID);
}
protected void verifyIdentifier(String filename, String expectedID) throws IOException {
final String json = IOUtils.toString(getClass().getResourceAsStream(filename));
final Publication pub = OBJECT_MAPPER.readValue(json, Publication.class);
String id = IdentifierFactory.createIdentifier(pub);
assertNotNull(id);
assertEquals(expectedID, id);
}
}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"scp-number"},"value":"79953761260"}]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f"}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"doi"},"value":"10.1016/j.cmet.2011.03.013"},{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmc"},"value":"21459329"}]}

View File

@ -0,0 +1 @@
{"id":"50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f","pid":[{"qualifier":{"classid":"urn"},"value":"urn:nbn:nl:ui:29-f3ed5f9e-edf6-457e-8848-61b58a4075e2"},{"qualifier":{"classid":"scp-number"},"value":"79953761260"},{"qualifier":{"classid":"pmcid"},"value":"21459329"}]}

View File

@ -39,15 +39,15 @@ public class ModelConstants {
public static final String IS_SUPPLEMENT_TO = "isSupplementTo"; public static final String IS_SUPPLEMENT_TO = "isSupplementTo";
public static final String IS_SUPPLEMENTED_BY = "isSupplementedBy"; public static final String IS_SUPPLEMENTED_BY = "isSupplementedBy";
public static final String PART = "part"; public static final String PART = "part";
public static final String IS_PART_OF = "IsPartOf"; public static final String IS_PART_OF = "isPartOf";
public static final String HAS_PARTS = "HasParts"; public static final String HAS_PARTS = "hasParts";
public static final String RELATIONSHIP = "relationship"; public static final String RELATIONSHIP = "relationship";
public static final String CITATION = "citation"; public static final String CITATION = "citation";
public static final String CITES = "cites"; public static final String CITES = "cites";
public static final String IS_CITED_BY = "IsCitedBy"; public static final String IS_CITED_BY = "isCitedBy";
public static final String REVIEW = "review"; public static final String REVIEW = "review";
public static final String REVIEWS = "reviews"; public static final String REVIEWS = "reviews";
public static final String IS_REVIEWED_BY = "IsReviewedBy"; public static final String IS_REVIEWED_BY = "isReviewedBy";
public static final String RESULT_PROJECT = "resultProject"; public static final String RESULT_PROJECT = "resultProject";
public static final String OUTCOME = "outcome"; public static final String OUTCOME = "outcome";

View File

@ -4,6 +4,7 @@ package eu.dnetlib.dhp.actionmanager.project;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Optional; import java.util.Optional;
@ -11,6 +12,7 @@ import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.*; import org.apache.spark.sql.*;
import org.slf4j.Logger; import org.slf4j.Logger;
@ -175,43 +177,54 @@ public class PrepareProgramme {
return csvProgramme; return csvProgramme;
}); });
prepareClassification(h2020Programmes); // prepareClassification(h2020Programmes);
h2020Programmes JavaSparkContext jsc = new JavaSparkContext(spark.sparkContext());
.map(csvProgramme -> OBJECT_MAPPER.writeValueAsString(csvProgramme))
JavaRDD<CSVProgramme> rdd = jsc.parallelize(prepareClassification(h2020Programmes), 1);
rdd
.map(csvProgramme -> {
String tmp = OBJECT_MAPPER.writeValueAsString(csvProgramme);
return tmp;
})
.saveAsTextFile(outputPath); .saveAsTextFile(outputPath);
} }
private static void prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) { private static List<CSVProgramme> prepareClassification(JavaRDD<CSVProgramme> h2020Programmes) {
Object[] codedescription = h2020Programmes Object[] codedescription = h2020Programmes
.map(value -> new Tuple2<>(value.getCode(), value.getTitle())) .map(
value -> new Tuple2<>(value.getCode(),
new Tuple2<String, String>(value.getTitle(), value.getShortTitle())))
.collect() .collect()
.toArray(); .toArray();
for (int i = 0; i < codedescription.length - 1; i++) { for (int i = 0; i < codedescription.length - 1; i++) {
for (int j = i + 1; j < codedescription.length; j++) { for (int j = i + 1; j < codedescription.length; j++) {
Tuple2<String, String> t2i = (Tuple2<String, String>) codedescription[i]; Tuple2<String, Tuple2<String, String>> t2i = (Tuple2<String, Tuple2<String, String>>) codedescription[i];
Tuple2<String, String> t2j = (Tuple2<String, String>) codedescription[j]; Tuple2<String, Tuple2<String, String>> t2j = (Tuple2<String, Tuple2<String, String>>) codedescription[j];
if (t2i._1().compareTo(t2j._1()) > 0) { if (t2i._1().compareTo(t2j._1()) > 0) {
Tuple2<String, String> temp = t2i; Tuple2<String, Tuple2<String, String>> temp = t2i;
codedescription[i] = t2j; codedescription[i] = t2j;
codedescription[j] = temp; codedescription[j] = temp;
} }
} }
} }
Map<String, String> map = new HashMap<>(); Map<String, Tuple2<String, String>> map = new HashMap<>();
for (int j = 0; j < codedescription.length; j++) { for (int j = 0; j < codedescription.length; j++) {
Tuple2<String, String> entry = (Tuple2<String, String>) codedescription[j]; Tuple2<String, Tuple2<String, String>> entry = (Tuple2<String, Tuple2<String, String>>) codedescription[j];
String ent = entry._1(); String ent = entry._1();
if (ent.contains("Euratom-")) { if (ent.contains("Euratom-")) {
ent = ent.replace("-Euratom-", ".Euratom."); ent = ent.replace("-Euratom-", ".Euratom.");
} }
String[] tmp = ent.split("\\."); String[] tmp = ent.split("\\.");
if (tmp.length <= 2) { if (tmp.length <= 2) {
map.put(entry._1(), entry._2()); if (StringUtils.isEmpty(entry._2()._2())) {
map.put(entry._1(), new Tuple2<String, String>(entry._2()._1(), entry._2()._1()));
} else {
map.put(entry._1(), entry._2());
}
} else { } else {
if (ent.endsWith(".")) { if (ent.endsWith(".")) {
ent = ent.substring(0, ent.length() - 1); ent = ent.substring(0, ent.length() - 1);
@ -224,14 +237,14 @@ public class PrepareProgramme {
key = key.substring(0, key.length() - 1); key = key.substring(0, key.length() - 1);
} }
} }
String current = entry._2(); String current = entry._2()._1();
if (!ent.contains("Euratom")) { if (!ent.contains("Euratom")) {
String parent; String parent;
String tmp_key = tmp[0] + "."; String tmp_key = tmp[0] + ".";
for (int i = 1; i < tmp.length - 1; i++) { for (int i = 1; i < tmp.length - 1; i++) {
tmp_key += tmp[i] + "."; tmp_key += tmp[i] + ".";
parent = map.get(tmp_key).toLowerCase().trim(); parent = map.get(tmp_key)._1().toLowerCase().trim();
if (parent.contains("|")) { if (parent.contains("|")) {
parent = parent.substring(parent.lastIndexOf("|") + 1).trim(); parent = parent.substring(parent.lastIndexOf("|") + 1).trim();
} }
@ -246,18 +259,29 @@ public class PrepareProgramme {
} }
} }
map.put(ent + ".", map.get(key) + " | " + current); String shortTitle = entry._2()._2();
if (StringUtils.isEmpty(shortTitle)) {
shortTitle = current;
}
Tuple2<String, String> newEntry = new Tuple2<>(map.get(key)._1() + " | " + current,
map.get(key)._2() + " | " + shortTitle);
map.put(ent + ".", newEntry);
} }
} }
h2020Programmes.foreach(csvProgramme -> { return h2020Programmes.map(csvProgramme -> {
if (!csvProgramme.getCode().endsWith(".") && !csvProgramme.getCode().contains("Euratom")
&& !csvProgramme.getCode().equals("H2020-EC")) String code = csvProgramme.getCode();
csvProgramme.setClassification(map.get(csvProgramme.getCode() + ".")); if (!code.endsWith(".") && !code.contains("Euratom")
else && !code.equals("H2020-EC"))
csvProgramme.setClassification(map.get(csvProgramme.getCode())); code += ".";
});
csvProgramme.setClassification(map.get(code)._1());
csvProgramme.setClassification_short(map.get(code)._2());
return csvProgramme;
}).collect();
} }
public static <R> Dataset<R> readPath( public static <R> Dataset<R> readPath(

View File

@ -9,7 +9,6 @@ import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -138,7 +137,8 @@ public class SparkAtomicActionJob {
pm.setCode(csvProject.getProgramme()); pm.setCode(csvProject.getProgramme());
h2020classification.setClassification(ocsvProgramme.get().getClassification()); h2020classification.setClassification(ocsvProgramme.get().getClassification());
h2020classification.setH2020Programme(pm); h2020classification.setH2020Programme(pm);
setLevelsAndProgramme(h2020classification, ocsvProgramme.get().getClassification()); setLevelsandProgramme(h2020classification, ocsvProgramme.get().getClassification_short());
// setProgramme(h2020classification, ocsvProgramme.get().getClassification());
pp.setH2020classification(Arrays.asList(h2020classification)); pp.setH2020classification(Arrays.asList(h2020classification));
return pp; return pp;
@ -177,8 +177,8 @@ public class SparkAtomicActionJob {
} }
private static void setLevelsAndProgramme(H2020Classification h2020Classification, String classification) { private static void setLevelsandProgramme(H2020Classification h2020Classification, String classification_short) {
String[] tmp = classification.split(" \\| "); String[] tmp = classification_short.split(" \\| ");
h2020Classification.setLevel1(tmp[0]); h2020Classification.setLevel1(tmp[0]);
if (tmp.length > 1) { if (tmp.length > 1) {
h2020Classification.setLevel2(tmp[1]); h2020Classification.setLevel2(tmp[1]);
@ -189,6 +189,12 @@ public class SparkAtomicActionJob {
h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]); h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
} }
// private static void setProgramme(H2020Classification h2020Classification, String classification) {
// String[] tmp = classification.split(" \\| ");
//
// h2020Classification.getH2020Programme().setDescription(tmp[tmp.length - 1]);
// }
public static <R> Dataset<R> readPath( public static <R> Dataset<R> readPath(
SparkSession spark, String inputPath, Class<R> clazz) { SparkSession spark, String inputPath, Class<R> clazz) {
return spark return spark

View File

@ -22,6 +22,15 @@ public class CSVProgramme implements Serializable {
private String shortTitle; private String shortTitle;
private String language; private String language;
private String classification; private String classification;
private String classification_short;
public String getClassification_short() {
return classification_short;
}
public void setClassification_short(String classification_short) {
this.classification_short = classification_short;
}
public String getClassification() { public String getClassification() {
return classification; return classification;

View File

@ -9,12 +9,14 @@ import java.util.List;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException; import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Assertions;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException; import eu.dnetlib.dhp.actionmanager.project.httpconnector.CollectorServiceException;
import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector; import eu.dnetlib.dhp.actionmanager.project.httpconnector.HttpConnector;
import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser; import eu.dnetlib.dhp.actionmanager.project.utils.EXCELParser;
@Disabled
public class EXCELParserTest { public class EXCELParserTest {
private static Path workingDir; private static Path workingDir;

View File

@ -92,6 +92,8 @@ public class PrepareH2020ProgrammeTest {
Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count()); Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count());
// tmp.foreach(csvProgramme -> System.out.println(OBJECT_MAPPER.writeValueAsString(csvProgramme)));
Assertions Assertions
.assertEquals( .assertEquals(
"Societal challenges | Smart, Green And Integrated Transport | CLEANSKY2 | IADP Fast Rotorcraft", "Societal challenges | Smart, Green And Integrated Transport | CLEANSKY2 | IADP Fast Rotorcraft",

View File

@ -78,7 +78,7 @@ public class SparkUpdateProjectTest {
"-programmePath", "-programmePath",
getClass() getClass()
.getResource( .getResource(
"/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_classification_whole.json.gz") "/eu/dnetlib/dhp/actionmanager/project/preparedProgramme_whole.json.gz")
.getPath(), .getPath(),
"-projectPath", "-projectPath",
getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json").getPath(), getClass().getResource("/eu/dnetlib/dhp/actionmanager/project/prepared_projects.json").getPath(),
@ -124,7 +124,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Societal challenges", "Societal Challenges",
execverification execverification
.filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'")
.select("classification.level1") .select("classification.level1")
@ -133,7 +133,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Smart, Green And Integrated Transport", "Transport",
execverification execverification
.filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'") .filter("id = '40|corda__h2020::2c7298913008865ba784e5c1350a0aa5'")
.select("classification.level2") .select("classification.level2")
@ -188,7 +188,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Nurturing excellence by means of cross-border and cross-sector mobility", "MSCA Mobility",
execverification execverification
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
.select("classification.h2020Programme.description") .select("classification.h2020Programme.description")
@ -197,7 +197,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Excellent science", "Excellent Science",
execverification execverification
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
.select("classification.level1") .select("classification.level1")
@ -206,7 +206,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Marie Skłodowska-Curie Actions", "Marie-Sklodowska-Curie Actions",
execverification execverification
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
.select("classification.level2") .select("classification.level2")
@ -215,7 +215,7 @@ public class SparkUpdateProjectTest {
.getString(0)); .getString(0));
Assertions Assertions
.assertEquals( .assertEquals(
"Nurturing excellence by means of cross-border and cross-sector mobility", "MSCA Mobility",
execverification execverification
.filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'") .filter("id = '40|corda__h2020::1a1f235fdd06ef14790baec159aa1202'")
.select("classification.level3") .select("classification.level3")

View File

@ -6,8 +6,10 @@ import org.apache.commons.logging.LogFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory; import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.ssl.SSLContextBuilder; import org.apache.http.ssl.SSLContextBuilder;
import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
@Disabled
public class HttpConnectorTest { public class HttpConnectorTest {
private static final Log log = LogFactory.getLog(HttpConnectorTest.class); private static final Log log = LogFactory.getLog(HttpConnectorTest.class);

View File

@ -82,7 +82,7 @@ public class DedupRecordFactory {
final Collection<String> dates = Lists.newArrayList(); final Collection<String> dates = Lists.newArrayList();
final List<List<Author>> authors = Lists.newArrayList(); final List<List<Author>> authors = Lists.newArrayList();
final List<Identifier> bestPids = Lists.newArrayList(); // best pids list final List<Identifier<T>> bestPids = Lists.newArrayList(); // best pids list
entities entities
.forEachRemaining( .forEachRemaining(
@ -90,7 +90,7 @@ public class DedupRecordFactory {
T duplicate = t._2(); T duplicate = t._2();
// prepare the list of pids to use for the id generation // prepare the list of pids to use for the id generation
bestPids.addAll(IdGenerator.bestPidToIdentifier(duplicate)); bestPids.add(Identifier.newInstance(duplicate));
entity.mergeFrom(duplicate); entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) { if (ModelSupport.isSubClass(duplicate, Result.class)) {

View File

@ -1,124 +1,46 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import static org.apache.commons.lang3.StringUtils.substringAfter;
import static org.apache.commons.lang3.StringUtils.substringBefore;
import java.io.Serializable; import java.io.Serializable;
import java.text.ParseException; import java.util.List;
import java.text.SimpleDateFormat;
import java.util.*;
import org.apache.commons.lang.StringUtils;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.dedup.model.Identifier; import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.dedup.model.PidType;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Result; import eu.dnetlib.dhp.schema.oaf.utils.PidType;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.dhp.utils.DHPUtils;
public class IdGenerator implements Serializable { public class IdGenerator implements Serializable {
public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
public static String BASE_DATE = "2000-01-01";
// pick the best pid from the list (consider date and pidtype) // pick the best pid from the list (consider date and pidtype)
public static String generate(List<Identifier> pids, String defaultID) { public static <T extends OafEntity> String generate(List<Identifier<T>> pids, String defaultID) {
if (pids == null || pids.size() == 0) if (pids == null || pids.size() == 0)
return defaultID; return defaultID;
Optional<Identifier> bp = pids Identifier<T> bp = pids
.stream() .stream()
.max(Identifier::compareTo); .min(Identifier::compareTo)
.get();
if (bp.get().isUseOriginal() || bp.get().getPid().getValue() == null) { String prefix = substringBefore(bp.getOriginalID(), "|");
return bp.get().getOriginalID().split("\\|")[0] + "|dedup_wf_001::" String ns = substringBefore(substringAfter(bp.getOriginalID(), "|"), "::");
+ DHPUtils.md5(bp.get().getOriginalID()); String suffix = substringAfter(bp.getOriginalID(), "::");
final String pidType = substringBefore(ns, "_");
if (PidType.isValid(pidType)) {
return prefix + "|" + dedupify(ns) + "::" + suffix;
} else { } else {
return bp.get().getOriginalID().split("\\|")[0] + "|" return prefix + "|dedup_wf_001::" + suffix;
+ createPrefix(bp.get().getPid().getQualifier().getClassid()) + "::"
+ DHPUtils.md5(bp.get().getPid().getValue());
} }
} }
public static <T extends OafEntity> ArrayList<Identifier> createBasePid(T entity, SimpleDateFormat sdf) { private static String dedupify(String ns) {
StringBuilder prefix = new StringBuilder(substringBefore(ns, "_")).append("_dedup");
Date date;
try {
date = sdf.parse(BASE_DATE);
} catch (ParseException e) {
date = new Date();
}
return Lists
.newArrayList(
new Identifier(new StructuredProperty(), date, PidType.original, entity.getCollectedfrom(),
EntityType.fromClass(entity.getClass()), entity.getId()));
}
// pick the best pid from the entity. Returns a list (length 1) to save time in the call
public static <T extends OafEntity> List<Identifier> bestPidToIdentifier(T entity) {
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
if (entity.getPid() == null || entity.getPid().size() == 0)
return createBasePid(entity, sdf);
Optional<StructuredProperty> bp = entity
.getPid()
.stream()
.filter(pid -> PidType.classidValueOf(pid.getQualifier().getClassid()) != PidType.undefined)
.max(Comparator.comparing(pid -> PidType.classidValueOf(pid.getQualifier().getClassid())));
return bp
.map(
structuredProperty -> Lists
.newArrayList(
new Identifier(structuredProperty, extractDate(entity, sdf),
PidType.classidValueOf(structuredProperty.getQualifier().getClassid()),
entity.getCollectedfrom(), EntityType.fromClass(entity.getClass()), entity.getId())))
.orElseGet(() -> createBasePid(entity, sdf));
}
// create the prefix (length = 12): dedup_+ pidType
public static String createPrefix(String pidType) {
StringBuilder prefix = new StringBuilder("dedup_" + pidType);
while (prefix.length() < 12) { while (prefix.length() < 12) {
prefix.append("_"); prefix.append("_");
} }
return prefix.toString().substring(0, 12); return prefix.substring(0, 12);
} }
// extracts the date from the record. If the date is not available or is not wellformed, it returns a base date:
// 00-01-01
public static <T extends OafEntity> Date extractDate(T duplicate, SimpleDateFormat sdf) {
String date = BASE_DATE;
if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result result = (Result) duplicate;
if (isWellformed(result.getDateofacceptance())) {
date = result.getDateofacceptance().getValue();
}
}
try {
return sdf.parse(date);
} catch (ParseException e) {
return new Date();
}
}
public static boolean isWellformed(Field<String> date) {
return date != null && StringUtils.isNotBlank(date.getValue())
&& date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
}
} }

View File

@ -2,93 +2,85 @@
package eu.dnetlib.dhp.oa.dedup.model; package eu.dnetlib.dhp.oa.dedup.model;
import java.io.Serializable; import java.io.Serializable;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Optional;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Sets; import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.dedup.IdGenerator;
import eu.dnetlib.dhp.oa.dedup.DatePicker;
import eu.dnetlib.dhp.schema.common.EntityType; import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.PidComparator;
import eu.dnetlib.dhp.schema.oaf.utils.PidType;
public class Identifier implements Serializable, Comparable<Identifier> { public class Identifier<T extends OafEntity> implements Serializable, Comparable<Identifier> {
StructuredProperty pid; public static String CROSSREF_ID = "10|openaire____::081b82f96300b6a6e3d282bad31cb6e2";
Date date; public static String DATACITE_ID = "10|openaire____::9e3be59865b2c1c335d32dae2fe7b254";
PidType type; public static String BASE_DATE = "2000-01-01";
List<KeyValue> collectedFrom;
EntityType entityType;
String originalID;
boolean useOriginal = false; // to know if the top identifier won because of the alphabetical order of the original private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
// ID
public Identifier(StructuredProperty pid, Date date, PidType type, List<KeyValue> collectedFrom, private T entity;
EntityType entityType, String originalID) {
this.pid = pid; public static <T extends OafEntity> Identifier newInstance(T entity) {
this.date = date; return new Identifier(entity);
this.type = type;
this.collectedFrom = collectedFrom;
this.entityType = entityType;
this.originalID = originalID;
} }
public StructuredProperty getPid() { public Identifier(T entity) {
return pid; this.entity = entity;
} }
public void setPid(StructuredProperty pid) { public T getEntity() {
this.pid = pid; return entity;
}
public void setEntity(T entity) {
this.entity = entity;
} }
public Date getDate() { public Date getDate() {
return date; String date = BASE_DATE;
if (ModelSupport.isSubClass(getEntity(), Result.class)) {
Result result = (Result) getEntity();
if (isWellformed(result.getDateofacceptance())) {
date = result.getDateofacceptance().getValue();
}
}
try {
return sdf.parse(date);
} catch (ParseException e) {
return new Date();
}
} }
public void setDate(Date date) { private static boolean isWellformed(Field<String> date) {
this.date = date; return date != null && StringUtils.isNotBlank(date.getValue())
} && date.getValue().matches(DatePicker.DATE_PATTERN) && DatePicker.inRange(date.getValue());
public PidType getType() {
return type;
}
public void setType(PidType type) {
this.type = type;
} }
public List<KeyValue> getCollectedFrom() { public List<KeyValue> getCollectedFrom() {
return collectedFrom; return entity.getCollectedfrom();
}
public void setCollectedFrom(List<KeyValue> collectedFrom) {
this.collectedFrom = collectedFrom;
} }
public EntityType getEntityType() { public EntityType getEntityType() {
return entityType; return EntityType.fromClass(entity.getClass());
}
public void setEntityType(EntityType entityType) {
this.entityType = entityType;
} }
public String getOriginalID() { public String getOriginalID() {
return originalID; return entity.getId();
} }
public void setOriginalID(String originalID) { private PidType getPidType() {
this.originalID = originalID; return PidType.tryValueOf(StringUtils.substringBefore(StringUtils.substringAfter(entity.getId(), "|"), "_"));
}
public boolean isUseOriginal() {
return useOriginal;
}
public void setUseOriginal(boolean useOriginal) {
this.useOriginal = useOriginal;
} }
@Override @Override
@ -96,50 +88,50 @@ public class Identifier implements Serializable, Comparable<Identifier> {
// priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4) // priority in comparisons: 1) pidtype, 2) collectedfrom (depending on the entity type) , 3) date 4)
// alphabetical order of the originalID // alphabetical order of the originalID
Set<String> lKeys = Sets.newHashSet(); Set<String> lKeys = Optional
if (this.collectedFrom != null) .ofNullable(getCollectedFrom())
lKeys = this.collectedFrom.stream().map(KeyValue::getKey).collect(Collectors.toSet()); .map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
Set<String> rKeys = Sets.newHashSet(); final Optional<List<KeyValue>> cf = Optional.ofNullable(i.getCollectedFrom());
if (i.getCollectedFrom() != null) Set<String> rKeys = cf
rKeys = i.getCollectedFrom().stream().map(KeyValue::getKey).collect(Collectors.toSet()); .map(c -> c.stream().map(KeyValue::getKey).collect(Collectors.toSet()))
.orElse(Sets.newHashSet());
if (this.getType().compareTo(i.getType()) == 0) { // same type if (this.getPidType().compareTo(i.getPidType()) == 0) { // same type
if (entityType == EntityType.publication) { if (getEntityType() == EntityType.publication) {
if (isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID) if (isFromDatasourceID(lKeys, CROSSREF_ID)
&& !isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)) && !isFromDatasourceID(rKeys, CROSSREF_ID))
return 1;
if (isFromDatasourceID(rKeys, IdGenerator.CROSSREF_ID)
&& !isFromDatasourceID(lKeys, IdGenerator.CROSSREF_ID))
return -1; return -1;
if (isFromDatasourceID(rKeys, CROSSREF_ID)
&& !isFromDatasourceID(lKeys, CROSSREF_ID))
return 1;
} }
if (entityType == EntityType.dataset) { if (getEntityType() == EntityType.dataset) {
if (isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID) if (isFromDatasourceID(lKeys, DATACITE_ID)
&& !isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)) && !isFromDatasourceID(rKeys, DATACITE_ID))
return 1;
if (isFromDatasourceID(rKeys, IdGenerator.DATACITE_ID)
&& !isFromDatasourceID(lKeys, IdGenerator.DATACITE_ID))
return -1; return -1;
if (isFromDatasourceID(rKeys, DATACITE_ID)
&& !isFromDatasourceID(lKeys, DATACITE_ID))
return 1;
} }
if (this.getDate().compareTo(i.getDate()) == 0) {// same date if (this.getDate().compareTo(i.getDate()) == 0) {// same date
if (this.originalID.compareTo(i.originalID) < 0)
this.useOriginal = true;
else
i.setUseOriginal(true);
// the minus because we need to take the alphabetically lower id // the minus because we need to take the alphabetically lower id
return -this.originalID.compareTo(i.originalID); return this.getOriginalID().compareTo(i.getOriginalID());
} else } else
// the minus is because we need to take the elder date // the minus is because we need to take the elder date
return -this.getDate().compareTo(i.getDate()); return this.getDate().compareTo(i.getDate());
} else { } else {
return this.getType().compareTo(i.getType()); return new PidComparator<>(getEntity()).compare(toSP(getPidType()), toSP(i.getPidType()));
} }
} }
private StructuredProperty toSP(PidType pidType) {
return OafMapperUtils.structuredProperty("", pidType.toString(), pidType.toString(), "", "", new DataInfo());
}
public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) { public boolean isFromDatasourceID(Set<String> collectedFrom, String dsId) {
return collectedFrom.contains(dsId); return collectedFrom.contains(dsId);
} }

View File

@ -1,17 +0,0 @@
package eu.dnetlib.dhp.oa.dedup.model;
public enum PidType {
// from the less to the more important
undefined, original, orcid, ror, grid, pdb, arXiv, pmid, pmc, doi;
public static PidType classidValueOf(String s) {
try {
return PidType.valueOf(s);
} catch (Exception e) {
return PidType.undefined;
}
}
}

View File

@ -9,6 +9,7 @@ import java.io.IOException;
import java.io.Serializable; import java.io.Serializable;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.util.*; import java.util.*;
import java.util.stream.Collectors;
import org.codehaus.jackson.map.ObjectMapper; import org.codehaus.jackson.map.ObjectMapper;
import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeEach;
@ -21,16 +22,16 @@ import scala.Tuple2;
public class EntityMergerTest implements Serializable { public class EntityMergerTest implements Serializable {
List<Tuple2<String, Publication>> publications; private List<Tuple2<String, Publication>> publications;
List<Tuple2<String, Publication>> publications2; private List<Tuple2<String, Publication>> publications2;
List<Tuple2<String, Publication>> publications3; private List<Tuple2<String, Publication>> publications3;
List<Tuple2<String, Publication>> publications4; private List<Tuple2<String, Publication>> publications4;
List<Tuple2<String, Publication>> publications5; private List<Tuple2<String, Publication>> publications5;
String testEntityBasePath; private String testEntityBasePath;
DataInfo dataInfo; private DataInfo dataInfo;
String dedupId = "00|dedup_id::1"; private String dedupId = "00|dedup_id::1";
Publication pub_top; private Publication pub_top;
@BeforeEach @BeforeEach
public void setUp() throws Exception { public void setUp() throws Exception {
@ -61,9 +62,9 @@ public class EntityMergerTest implements Serializable {
Software merged = DedupRecordFactory Software merged = DedupRecordFactory
.entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class); .entityMerger(dedupId, softwares.iterator(), 0, dataInfo, Software.class);
assertEquals(merged.getBestaccessright().getClassid(), "OPEN SOURCE"); assertEquals("OPEN SOURCE", merged.getBestaccessright().getClassid());
assertEquals(merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340"); assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", merged.getId());
} }
@ -74,45 +75,45 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals(pub_merged.getId(), "50|dedup_doi___::0968af610a356656706657e4f234b340"); assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", pub_merged.getId());
assertEquals(pub_merged.getJournal(), pub_top.getJournal()); assertEquals(pub_top.getJournal(), pub_merged.getJournal());
assertEquals(pub_merged.getBestaccessright().getClassid(), "OPEN"); assertEquals("OPEN", pub_merged.getBestaccessright().getClassid());
assertEquals(pub_merged.getResulttype(), pub_top.getResulttype()); assertEquals(pub_top.getResulttype(), pub_merged.getResulttype());
assertEquals(pub_merged.getLanguage(), pub_merged.getLanguage()); assertEquals(pub_top.getLanguage(), pub_merged.getLanguage());
assertEquals(pub_merged.getPublisher(), pub_top.getPublisher()); assertEquals(pub_top.getPublisher(), pub_merged.getPublisher());
assertEquals(pub_merged.getEmbargoenddate(), pub_top.getEmbargoenddate()); assertEquals(pub_top.getEmbargoenddate(), pub_merged.getEmbargoenddate());
assertEquals(pub_merged.getResourcetype().getClassid(), "0004"); assertEquals(pub_top.getResourcetype().getClassid(), "");
assertEquals(pub_merged.getDateoftransformation(), pub_top.getDateoftransformation()); assertEquals(pub_top.getDateoftransformation(), pub_merged.getDateoftransformation());
assertEquals(pub_merged.getOaiprovenance(), pub_top.getOaiprovenance()); assertEquals(pub_top.getOaiprovenance(), pub_merged.getOaiprovenance());
assertEquals(pub_merged.getDateofcollection(), pub_top.getDateofcollection()); assertEquals(pub_top.getDateofcollection(), pub_merged.getDateofcollection());
assertEquals(pub_merged.getInstance().size(), 3); assertEquals(3, pub_merged.getInstance().size());
assertEquals(pub_merged.getCountry().size(), 2); assertEquals(2, pub_merged.getCountry().size());
assertEquals(pub_merged.getSubject().size(), 0); assertEquals(0, pub_merged.getSubject().size());
assertEquals(pub_merged.getTitle().size(), 2); assertEquals(2, pub_merged.getTitle().size());
assertEquals(pub_merged.getRelevantdate().size(), 0); assertEquals(0, pub_merged.getRelevantdate().size());
assertEquals(pub_merged.getDescription().size(), 0); assertEquals(0, pub_merged.getDescription().size());
assertEquals(pub_merged.getSource().size(), 0); assertEquals(0, pub_merged.getSource().size());
assertEquals(pub_merged.getFulltext().size(), 0); assertEquals(0, pub_merged.getFulltext().size());
assertEquals(pub_merged.getFormat().size(), 0); assertEquals(0, pub_merged.getFormat().size());
assertEquals(pub_merged.getContributor().size(), 0); assertEquals(0, pub_merged.getContributor().size());
assertEquals(pub_merged.getCoverage().size(), 0); assertEquals(0, pub_merged.getCoverage().size());
assertEquals(pub_merged.getContext().size(), 0); assertEquals(0, pub_merged.getContext().size());
assertEquals(pub_merged.getExternalReference().size(), 0); assertEquals(0, pub_merged.getExternalReference().size());
assertEquals(pub_merged.getOriginalId().size(), 3); assertEquals(3, pub_merged.getOriginalId().size());
assertEquals(pub_merged.getCollectedfrom().size(), 3); assertEquals(3, pub_merged.getCollectedfrom().size());
assertEquals(pub_merged.getPid().size(), 1); assertEquals(1, pub_merged.getPid().size());
assertEquals(pub_merged.getExtraInfo().size(), 0); assertEquals(0, pub_merged.getExtraInfo().size());
// verify datainfo // verify datainfo
assertEquals(pub_merged.getDataInfo(), dataInfo); assertEquals(dataInfo, pub_merged.getDataInfo());
// verify datepicker // verify datepicker
assertEquals(pub_merged.getDateofacceptance().getValue(), "2018-09-30"); assertEquals("2018-09-30", pub_merged.getDateofacceptance().getValue());
// verify authors // verify authors
assertEquals(pub_merged.getAuthor().size(), 9); assertEquals(9, pub_merged.getAuthor().size());
assertEquals(AuthorMerger.countAuthorsPids(pub_merged.getAuthor()), 4); assertEquals(4, AuthorMerger.countAuthorsPids(pub_merged.getAuthor()));
// verify title // verify title
int count = 0; int count = 0;
@ -120,7 +121,7 @@ public class EntityMergerTest implements Serializable {
if (title.getQualifier().getClassid().equals("main title")) if (title.getQualifier().getClassid().equals("main title"))
count++; count++;
} }
assertEquals(count, 1); assertEquals(1, count);
} }
@Test @Test
@ -130,9 +131,9 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications2.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId()); assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
assertEquals(pub_merged.getAuthor().size(), 27); assertEquals(27, pub_merged.getAuthor().size());
} }
@Test @Test
@ -142,7 +143,7 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications3.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_doi___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId()); assertEquals("50|doi_dedup___::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
} }
@Test @Test
@ -152,17 +153,24 @@ public class EntityMergerTest implements Serializable {
.entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications4.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_wf_001::2d2bbbbcfb285e3fb3590237b79e2fa8", pub_merged.getId()); assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
} }
@Test @Test
public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException { public void publicationMergerTest5() throws InstantiationException, IllegalStateException, IllegalAccessException {
System.out
.println(
publications5
.stream()
.map(p -> p._2().getId())
.collect(Collectors.toList()));
Publication pub_merged = DedupRecordFactory Publication pub_merged = DedupRecordFactory
.entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class); .entityMerger(dedupId, publications5.iterator(), 0, dataInfo, Publication.class);
// verify id // verify id
assertEquals("50|dedup_wf_001::584b89679c3ccd1015b647ec63cc2699", pub_merged.getId()); assertEquals("50|dedup_wf_001::0ca46ff10b2b4c756191719d85302b14", pub_merged.getId());
} }
public DataInfo setDI() { public DataInfo setDI() {

View File

@ -7,96 +7,57 @@ import java.io.BufferedReader;
import java.io.FileReader; import java.io.FileReader;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Paths; import java.nio.file.Paths;
import java.text.SimpleDateFormat;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.List; import java.util.List;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.codehaus.jackson.map.ObjectMapper; import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.*; import org.junit.jupiter.api.MethodOrderer;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.TestMethodOrder;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.dedup.model.Identifier; import eu.dnetlib.dhp.oa.dedup.model.Identifier;
import eu.dnetlib.dhp.oa.dedup.model.PidType; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.pace.util.MapDocumentUtil; import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2; import scala.Tuple2;
@TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class IdGeneratorTest { public class IdGeneratorTest {
private static List<Identifier> bestIds; private static ObjectMapper OBJECT_MAPPER = new ObjectMapper()
private static List<Tuple2<String, Publication>> pubs; .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
private static List<Identifier> bestIds2; private static List<Identifier<Publication>> bestIds;
private static List<Identifier> bestIds3; private static List<Identifier<Publication>> bestIds2;
private static List<Identifier<Publication>> bestIds3;
private static String testEntityBasePath; private static String testEntityBasePath;
private static SimpleDateFormat sdf;
private static Date baseDate;
@BeforeAll @BeforeAll
public static void setUp() throws Exception { public static void setUp() throws Exception {
sdf = new SimpleDateFormat("yyyy-MM-dd");
baseDate = sdf.parse("2000-01-01");
bestIds = new ArrayList<>();
bestIds2 = Lists
.newArrayList(
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
new Identifier(pid("pid2", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID2"),
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
bestIds3 = Lists
.newArrayList(
new Identifier(pid("pid1", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID1"),
new Identifier(pid("pid2", "doi", "doi"), baseDate, PidType.doi, keyValue("key", "value"),
EntityType.publication, "50|originalID2"),
new Identifier(pid("pid3", "original", "original"), baseDate, PidType.original,
keyValue("key", "value"), EntityType.publication, "50|originalID3"));
testEntityBasePath = Paths testEntityBasePath = Paths
.get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI()) .get(SparkDedupTest.class.getResource("/eu/dnetlib/dhp/dedup/json").toURI())
.toFile() .toFile()
.getAbsolutePath(); .getAbsolutePath();
pubs = readSample(testEntityBasePath + "/publication_idgeneration.json", Publication.class); bestIds = createBestIds(testEntityBasePath + "/publication_idgeneration.json", Publication.class);
bestIds2 = createBestIds(testEntityBasePath + "/publication_idgeneration2.json", Publication.class);
bestIds3 = createBestIds(testEntityBasePath + "/publication_idgeneration3.json", Publication.class);
} }
@Test @Test
@Order(1)
public void bestPidToIdentifierTest() {
List<String> typesForAssertions = Lists
.newArrayList(PidType.pmc.toString(), PidType.doi.toString(), PidType.doi.toString());
for (Tuple2<String, Publication> pub : pubs) {
List<Identifier> ids = IdGenerator.bestPidToIdentifier(pub._2());
assertEquals(typesForAssertions.get(pubs.indexOf(pub)), ids.get(0).getPid().getQualifier().getClassid());
bestIds.addAll(ids);
}
}
@Test
@Order(2)
public void generateIdTest1() { public void generateIdTest1() {
String id1 = IdGenerator.generate(bestIds, "50|defaultID"); String id1 = IdGenerator.generate(bestIds, "50|defaultID");
System.out.println("id list 1 = " + bestIds.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList())); System.out
.println("id list 1 = " + bestIds.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
assertEquals("50|dedup_wf_001::9c5cfbf993d38476e0f959a301239719", id1); assertEquals("50|doi_dedup___::0968af610a356656706657e4f234b340", id1);
} }
@Test @Test
@ -104,13 +65,23 @@ public class IdGeneratorTest {
String id1 = IdGenerator.generate(bestIds2, "50|defaultID"); String id1 = IdGenerator.generate(bestIds2, "50|defaultID");
String id2 = IdGenerator.generate(bestIds3, "50|defaultID"); String id2 = IdGenerator.generate(bestIds3, "50|defaultID");
System.out.println("id list 2 = " + bestIds2.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList())); System.out
.println("id list 2 = " + bestIds2.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
System.out.println("winner 2 = " + id1); System.out.println("winner 2 = " + id1);
System.out.println("id list 3 = " + bestIds3.stream().map(i -> i.getPid().getValue()).collect(Collectors.toList())); System.out
.println("id list 3 = " + bestIds3.stream().map(i -> i.getOriginalID()).collect(Collectors.toList()));
System.out.println("winner 3 = " + id2); System.out.println("winner 3 = " + id2);
assertEquals("50|dedup_wf_001::2c56cc1914bffdb30fdff354e0099612", id1); assertEquals("50|doi_dedup___::1a77a3bba737f8b669dcf330ad3b37e2", id1);
assertEquals("50|dedup_doi___::128ead3ed8d9ecf262704b6fcf592b8d", id2); assertEquals("50|dedup_wf_001::0829b5191605bdbea36d6502b8c1ce1g", id2);
}
protected static <T extends OafEntity> List<Identifier<T>> createBestIds(String path, Class<T> clazz) {
final Stream<Identifier<T>> ids = readSample(path, clazz)
.stream()
.map(Tuple2::_2)
.map(Identifier::newInstance);
return ids.collect(Collectors.toList());
} }
public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) { public static <T> List<Tuple2<String, T>> readSample(String path, Class<T> clazz) {
@ -124,7 +95,7 @@ public class IdGeneratorTest {
.add( .add(
new Tuple2<>( new Tuple2<>(
MapDocumentUtil.getJPathString("$.id", line), MapDocumentUtil.getJPathString("$.id", line),
new ObjectMapper().readValue(line, clazz))); OBJECT_MAPPER.readValue(line, clazz)));
// read next line // read next line
line = reader.readLine(); line = reader.readLine();
} }
@ -137,23 +108,10 @@ public class IdGeneratorTest {
} }
public static StructuredProperty pid(String pid, String classid, String classname) { public static StructuredProperty pid(String pid, String classid, String classname) {
return OafMapperUtils.structuredProperty(pid, classid, classname, "", "", new DataInfo());
StructuredProperty sp = new StructuredProperty();
sp.setValue(pid);
Qualifier q = new Qualifier();
q.setSchemeid(classid);
q.setSchemename(classname);
q.setClassname(classname);
q.setClassid(classid);
sp.setQualifier(q);
return sp;
} }
public static List<KeyValue> keyValue(String key, String value) { public static List<KeyValue> keyValue(String key, String value) {
return Lists.newArrayList(OafMapperUtils.keyValue(key, value));
KeyValue kv = new KeyValue();
kv.setKey(key);
kv.setValue(value);
return Lists.newArrayList(kv);
} }
} }

View File

@ -1,12 +1,18 @@
package eu.dnetlib.dhp.oa.dedup; package eu.dnetlib.dhp.oa.dedup;
import com.fasterxml.jackson.databind.ObjectMapper; import static java.nio.file.Files.createTempDirectory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation; import static org.apache.spark.sql.functions.count;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import static org.junit.jupiter.api.Assertions.assertEquals;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import static org.mockito.Mockito.lenient;
import eu.dnetlib.pace.util.MapDocumentUtil;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import org.apache.commons.io.FileUtils; import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -25,19 +31,16 @@ import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock; import org.mockito.Mock;
import org.mockito.Mockito; import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension; import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
import scala.Tuple2; import scala.Tuple2;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
import java.net.URISyntaxException;
import java.nio.file.Paths;
import static java.nio.file.Files.createTempDirectory;
import static org.apache.spark.sql.functions.count;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.mockito.Mockito.lenient;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class) @TestMethodOrder(MethodOrderer.OrderAnnotation.class)
public class SparkDedupTest implements Serializable { public class SparkDedupTest implements Serializable {

View File

@ -0,0 +1,3 @@
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|doi_________::1a77a3bba737f8b669dcf330ad3b37e2", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "doi" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1f", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }

View File

@ -0,0 +1,3 @@
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1g", "pid" : [ { "value" : "pid1", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1h", "pid" : [ { "value" : "pid2", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }
{ "id" : "50|DansKnawCris::0829b5191605bdbea36d6502b8c1ce1i", "pid" : [ { "value" : "pid3", "qualifier" : { "classid" : "original" } } ], "dateofacceptance" : { "value" : "2000-01-01"}, "collectedfrom" : [ { "key" : "key", "value" : "value" } ] }

View File

@ -1,13 +1,15 @@
package eu.dnetlib.doiboost package eu.dnetlib.doiboost
import eu.dnetlib.dhp.application.ArgumentApplicationParser import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Publication, Relation, Dataset => OafDataset, Organization} import eu.dnetlib.dhp.oa.merge.AuthorMerger
import eu.dnetlib.dhp.schema.oaf.{Organization, Publication, Relation, Dataset => OafDataset}
import eu.dnetlib.doiboost.mag.ConversionUtil import eu.dnetlib.doiboost.mag.ConversionUtil
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.SparkConf import org.apache.spark.SparkConf
import org.apache.spark.sql.functions.col import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.{Logger, LoggerFactory} import org.slf4j.{Logger, LoggerFactory}
import scala.collection.JavaConverters._ import scala.collection.JavaConverters._
object SparkGenerateDoiBoost { object SparkGenerateDoiBoost {
@ -49,6 +51,7 @@ object SparkGenerateDoiBoost {
val otherPub = item._2._2 val otherPub = item._2._2
if (otherPub != null) { if (otherPub != null) {
crossrefPub.mergeFrom(otherPub) crossrefPub.mergeFrom(otherPub)
crossrefPub.setAuthor(AuthorMerger.mergeAuthor(crossrefPub.getAuthor, otherPub.getAuthor))
} }
} }
crossrefPub crossrefPub

View File

@ -38,6 +38,8 @@ class QueryTest {
def myQuery(spark:SparkSession, sc:SparkContext): Unit = { def myQuery(spark:SparkSession, sc:SparkContext): Unit = {
implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication] implicit val mapEncoderPub: Encoder[Publication] = Encoders.kryo[Publication]
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT) mapper.getSerializationConfig.enable(SerializationConfig.Feature.INDENT_OUTPUT)

View File

@ -3,13 +3,9 @@ package eu.dnetlib.dhp.oa.graph.clean;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.BufferedInputStream;
import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction; import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
@ -23,10 +19,7 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.AbstractMdRecordToOafMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

View File

@ -1,8 +1,8 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.*;
import java.util.*; import java.util.*;
@ -12,10 +12,13 @@ import org.dom4j.DocumentFactory;
import org.dom4j.DocumentHelper; import org.dom4j.DocumentHelper;
import org.dom4j.Node; import org.dom4j.Node;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.LicenseComparator; import eu.dnetlib.dhp.schema.common.LicenseComparator;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
public abstract class AbstractMdRecordToOafMapper { public abstract class AbstractMdRecordToOafMapper {
@ -133,20 +136,34 @@ public abstract class AbstractMdRecordToOafMapper {
final DataInfo info, final DataInfo info,
final long lastUpdateTimestamp) { final long lastUpdateTimestamp) {
final List<Oaf> oafs = new ArrayList<>(); final OafEntity entity = createEntity(doc, type, instances, collectedFrom, info, lastUpdateTimestamp);
final String id = IdentifierFactory.createIdentifier(entity);
if (!id.equals(entity.getId())) {
entity.getOriginalId().add(entity.getId());
entity.setId(id);
}
final List<Oaf> oafs = Lists.newArrayList(entity);
if (!oafs.isEmpty()) {
oafs.addAll(addProjectRels(doc, entity));
oafs.addAll(addOtherResultRels(doc, entity));
}
return oafs;
}
private OafEntity createEntity(Document doc, String type, List<Instance> instances, KeyValue collectedFrom,
DataInfo info, long lastUpdateTimestamp) {
switch (type.toLowerCase()) { switch (type.toLowerCase()) {
case "publication": case "publication":
final Publication p = new Publication(); final Publication p = new Publication();
populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp); populateResultFields(p, doc, instances, collectedFrom, info, lastUpdateTimestamp);
p.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE);
p.setJournal(prepareJournal(doc, info)); p.setJournal(prepareJournal(doc, info));
oafs.add(p); return p;
break;
case "dataset": case "dataset":
final Dataset d = new Dataset(); final Dataset d = new Dataset();
populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp); populateResultFields(d, doc, instances, collectedFrom, info, lastUpdateTimestamp);
d.setResulttype(DATASET_DEFAULT_RESULTTYPE);
d.setStoragedate(prepareDatasetStorageDate(doc, info)); d.setStoragedate(prepareDatasetStorageDate(doc, info));
d.setDevice(prepareDatasetDevice(doc, info)); d.setDevice(prepareDatasetDevice(doc, info));
d.setSize(prepareDatasetSize(doc, info)); d.setSize(prepareDatasetSize(doc, info));
@ -154,48 +171,34 @@ public abstract class AbstractMdRecordToOafMapper {
d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info)); d.setLastmetadataupdate(prepareDatasetLastMetadataUpdate(doc, info));
d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info)); d.setMetadataversionnumber(prepareDatasetMetadataVersionNumber(doc, info));
d.setGeolocation(prepareDatasetGeoLocations(doc, info)); d.setGeolocation(prepareDatasetGeoLocations(doc, info));
oafs.add(d); return d;
break;
case "software": case "software":
final Software s = new Software(); final Software s = new Software();
populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp); populateResultFields(s, doc, instances, collectedFrom, info, lastUpdateTimestamp);
s.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE);
s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info)); s.setDocumentationUrl(prepareSoftwareDocumentationUrls(doc, info));
s.setLicense(prepareSoftwareLicenses(doc, info)); s.setLicense(prepareSoftwareLicenses(doc, info));
s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info)); s.setCodeRepositoryUrl(prepareSoftwareCodeRepositoryUrl(doc, info));
s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info)); s.setProgrammingLanguage(prepareSoftwareProgrammingLanguage(doc, info));
oafs.add(s); return s;
break;
case "": case "":
case "otherresearchproducts": case "otherresearchproducts":
default: default:
final OtherResearchProduct o = new OtherResearchProduct(); final OtherResearchProduct o = new OtherResearchProduct();
populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp); populateResultFields(o, doc, instances, collectedFrom, info, lastUpdateTimestamp);
o.setResulttype(ORP_DEFAULT_RESULTTYPE);
o.setContactperson(prepareOtherResearchProductContactPersons(doc, info)); o.setContactperson(prepareOtherResearchProductContactPersons(doc, info));
o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info)); o.setContactgroup(prepareOtherResearchProductContactGroups(doc, info));
o.setTool(prepareOtherResearchProductTools(doc, info)); o.setTool(prepareOtherResearchProductTools(doc, info));
oafs.add(o); return o;
break;
} }
if (!oafs.isEmpty()) {
oafs.addAll(addProjectRels(doc, collectedFrom, info, lastUpdateTimestamp));
oafs.addAll(addOtherResultRels(doc, collectedFrom, info, lastUpdateTimestamp));
}
return oafs;
} }
private List<Oaf> addProjectRels( private List<Oaf> addProjectRels(
final Document doc, final Document doc,
final KeyValue collectedFrom, final OafEntity entity) {
final DataInfo info,
final long lastUpdateTimestamp) {
final List<Oaf> res = new ArrayList<>(); final List<Oaf> res = new ArrayList<>();
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); final String docId = entity.getId();
for (final Object o : doc.selectNodes("//oaf:projectid")) { for (final Object o : doc.selectNodes("//oaf:projectid")) {
@ -207,13 +210,11 @@ public abstract class AbstractMdRecordToOafMapper {
res res
.add( .add(
getRelation( getRelation(
docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, collectedFrom, info, docId, projectId, RESULT_PROJECT, OUTCOME, IS_PRODUCED_BY, entity));
lastUpdateTimestamp));
res res
.add( .add(
getRelation( getRelation(
projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, collectedFrom, info, projectId, docId, RESULT_PROJECT, OUTCOME, PRODUCES, entity));
lastUpdateTimestamp));
} }
} }
@ -225,26 +226,22 @@ public abstract class AbstractMdRecordToOafMapper {
final String relType, final String relType,
final String subRelType, final String subRelType,
final String relClass, final String relClass,
final KeyValue collectedFrom, final OafEntity entity) {
final DataInfo info,
final long lastUpdateTimestamp) {
final Relation rel = new Relation(); final Relation rel = new Relation();
rel.setRelType(relType); rel.setRelType(relType);
rel.setSubRelType(subRelType); rel.setSubRelType(subRelType);
rel.setRelClass(relClass); rel.setRelClass(relClass);
rel.setSource(source); rel.setSource(source);
rel.setTarget(target); rel.setTarget(target);
rel.setCollectedfrom(Arrays.asList(collectedFrom)); rel.setCollectedfrom(entity.getCollectedfrom());
rel.setDataInfo(info); rel.setDataInfo(entity.getDataInfo());
rel.setLastupdatetimestamp(lastUpdateTimestamp); rel.setLastupdatetimestamp(entity.getLastupdatetimestamp());
return rel; return rel;
} }
protected abstract List<Oaf> addOtherResultRels( protected abstract List<Oaf> addOtherResultRels(
final Document doc, final Document doc,
final KeyValue collectedFrom, final OafEntity entity);
final DataInfo info,
final long lastUpdateTimestamp);
private void populateResultFields( private void populateResultFields(
final Result r, final Result r,
@ -257,7 +254,7 @@ public abstract class AbstractMdRecordToOafMapper {
r.setLastupdatetimestamp(lastUpdateTimestamp); r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false)); r.setId(createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false));
r.setOriginalId(Arrays.asList(findOriginalId(doc))); r.setOriginalId(Lists.newArrayList(findOriginalId(doc)));
r.setCollectedfrom(Arrays.asList(collectedFrom)); r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setPid(prepareResultPids(doc, info)); r.setPid(prepareResultPids(doc, info));
@ -285,7 +282,7 @@ public abstract class AbstractMdRecordToOafMapper {
r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES r.setExternalReference(new ArrayList<>()); // NOT PRESENT IN MDSTORES
r.setInstance(instances); r.setInstance(instances);
r.setBestaccessright(getBestAccessRights(instances)); r.setBestaccessright(OafMapperUtils.createBestAccessRights(instances));
} }
protected abstract List<StructuredProperty> prepareResultPids(Document doc, DataInfo info); protected abstract List<StructuredProperty> prepareResultPids(Document doc, DataInfo info);
@ -370,38 +367,6 @@ public abstract class AbstractMdRecordToOafMapper {
protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info); protected abstract Field<String> prepareDatasetStorageDate(Document doc, DataInfo info);
public static Qualifier createBestAccessRights(final List<Instance> instanceList) {
return getBestAccessRights(instanceList);
}
protected static Qualifier getBestAccessRights(final List<Instance> instanceList) {
if (instanceList != null) {
final Optional<Qualifier> min = instanceList
.stream()
.map(i -> i.getAccessright())
.min(new LicenseComparator());
final Qualifier rights = min.isPresent() ? min.get() : new Qualifier();
if (StringUtils.isBlank(rights.getClassid())) {
rights.setClassid(UNKNOWN);
}
if (StringUtils.isBlank(rights.getClassname())
|| UNKNOWN.equalsIgnoreCase(rights.getClassname())) {
rights.setClassname(NOT_AVAILABLE);
}
if (StringUtils.isBlank(rights.getSchemeid())) {
rights.setSchemeid(DNET_ACCESS_MODES);
}
if (StringUtils.isBlank(rights.getSchemename())) {
rights.setSchemename(DNET_ACCESS_MODES);
}
return rights;
}
return null;
}
private Journal prepareJournal(final Document doc, final DataInfo info) { private Journal prepareJournal(final Document doc, final DataInfo info) {
final Node n = doc.selectSingleNode("//oaf:journal"); final Node n = doc.selectSingleNode("//oaf:journal");
if (n != null) { if (n != null) {
@ -564,4 +529,5 @@ public abstract class AbstractMdRecordToOafMapper {
} }
return res; return res;
} }
} }

View File

@ -4,11 +4,9 @@ package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.IOException; import java.io.IOException;
import java.util.Arrays; import java.util.*;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
@ -20,6 +18,7 @@ import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession; import org.apache.spark.sql.SparkSession;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
@ -29,16 +28,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelSupport; import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.utils.ISLookupClientFactory; import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import scala.Tuple2; import scala.Tuple2;
@ -124,7 +114,18 @@ public class GenerateEntitiesApplication {
private static Oaf merge(final Oaf o1, final Oaf o2) { private static Oaf merge(final Oaf o1, final Oaf o2) {
if (ModelSupport.isSubClass(o1, OafEntity.class)) { if (ModelSupport.isSubClass(o1, OafEntity.class)) {
((OafEntity) o1).mergeFrom((OafEntity) o2); if (ModelSupport.isSubClass(o1, Result.class)) {
return mergeResults((Result) o1, (Result) o2);
} else if (ModelSupport.isSubClass(o1, Datasource.class)) {
((Datasource) o1).mergeFrom((Datasource) o2);
} else if (ModelSupport.isSubClass(o1, Organization.class)) {
((Organization) o1).mergeFrom((Organization) o2);
} else if (ModelSupport.isSubClass(o1, Project.class)) {
((Project) o1).mergeFrom((Project) o2);
} else {
throw new RuntimeException("invalid OafEntity subtype:" + o1.getClass().getCanonicalName());
}
} else if (ModelSupport.isSubClass(o1, Relation.class)) { } else if (ModelSupport.isSubClass(o1, Relation.class)) {
((Relation) o1).mergeFrom((Relation) o2); ((Relation) o1).mergeFrom((Relation) o2);
} else { } else {
@ -133,6 +134,19 @@ public class GenerateEntitiesApplication {
return o1; return o1;
} }
protected static Result mergeResults(Result o1, Result o2) {
Result r1 = o1;
Result r2 = o2;
if (new ResultTypeComparator().compare(r1, r2) < 0) {
r1.mergeFrom(r2);
return r1;
} else {
r2.mergeFrom(r1);
return r2;
}
}
private static List<Oaf> convertToListOaf( private static List<Oaf> convertToListOaf(
final String id, final String id,
final String s, final String s,

View File

@ -1,16 +1,6 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.asString;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASET_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION; import static eu.dnetlib.dhp.schema.common.ModelConstants.DATASOURCE_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS; import static eu.dnetlib.dhp.schema.common.ModelConstants.DNET_PROVENANCE_ACTIONS;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION; import static eu.dnetlib.dhp.schema.common.ModelConstants.ENTITYREGISTRY_PROVENANCE_ACTION;
@ -19,19 +9,25 @@ import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PARTICIPANT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PRODUCED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_PROVIDED_BY;
import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO; import static eu.dnetlib.dhp.schema.common.ModelConstants.IS_RELATED_TO;
import static eu.dnetlib.dhp.schema.common.ModelConstants.ORP_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME; import static eu.dnetlib.dhp.schema.common.ModelConstants.OUTCOME;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION; import static eu.dnetlib.dhp.schema.common.ModelConstants.PARTICIPATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES; import static eu.dnetlib.dhp.schema.common.ModelConstants.PRODUCES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION; import static eu.dnetlib.dhp.schema.common.ModelConstants.PROJECT_ORGANIZATION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES; import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVIDES;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION; import static eu.dnetlib.dhp.schema.common.ModelConstants.PROVISION;
import static eu.dnetlib.dhp.schema.common.ModelConstants.PUBLICATION_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP; import static eu.dnetlib.dhp.schema.common.ModelConstants.RELATIONSHIP;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT; import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_PROJECT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT; import static eu.dnetlib.dhp.schema.common.ModelConstants.RESULT_RESULT;
import static eu.dnetlib.dhp.schema.common.ModelConstants.SOFTWARE_DEFAULT_RESULTTYPE;
import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM; import static eu.dnetlib.dhp.schema.common.ModelConstants.USER_CLAIM;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.asString;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.journal;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import java.io.Closeable; import java.io.Closeable;
import java.io.IOException; import java.io.IOException;
@ -445,22 +441,26 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE"); createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
try { try {
final String targetType = rs.getString(TARGET_TYPE);
if (rs.getString(SOURCE_TYPE).equals("context")) { if (rs.getString(SOURCE_TYPE).equals("context")) {
final Result r; final Result r;
if (rs.getString(TARGET_TYPE).equals("dataset")) { switch (targetType) {
r = new Dataset(); case "dataset":
r.setResulttype(DATASET_DEFAULT_RESULTTYPE); r = new Dataset();
} else if (rs.getString(TARGET_TYPE).equals("software")) { break;
r = new Software(); case "software":
r.setResulttype(SOFTWARE_DEFAULT_RESULTTYPE); r = new Software();
} else if (rs.getString(TARGET_TYPE).equals("other")) { break;
r = new OtherResearchProduct(); case "other":
r.setResulttype(ORP_DEFAULT_RESULTTYPE); r = new OtherResearchProduct();
} else { break;
r = new Publication(); case "publication":
r.setResulttype(PUBLICATION_DEFAULT_RESULTTYPE); default:
r = new Publication();
break;
} }
r.setId(createOpenaireId(50, rs.getString("target_id"), false)); r.setId(createOpenaireId(50, rs.getString("target_id"), false));
r.setLastupdatetimestamp(lastUpdateTimestamp); r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setContext(prepareContext(rs.getString("source_id"), info)); r.setContext(prepareContext(rs.getString("source_id"), info));
@ -470,7 +470,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
return Arrays.asList(r); return Arrays.asList(r);
} else { } else {
final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false); final String sourceId = createOpenaireId(rs.getString(SOURCE_TYPE), rs.getString("source_id"), false);
final String targetId = createOpenaireId(rs.getString(TARGET_TYPE), rs.getString("target_id"), false); final String targetId = createOpenaireId(targetType, rs.getString("target_id"), false);
final Relation r1 = new Relation(); final Relation r1 = new Relation();
final Relation r2 = new Relation(); final Relation r2 = new Relation();

View File

@ -1,10 +1,10 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.List; import java.util.List;
@ -19,15 +19,8 @@ import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OafToOafMapper extends AbstractMdRecordToOafMapper { public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@ -93,7 +86,13 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) { protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//dc:description", info); return prepareListFields(doc, "//dc:description", info)
.stream()
.map(d -> {
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
return d;
})
.collect(Collectors.toList());
} }
@Override @Override
@ -257,11 +256,9 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<Oaf> addOtherResultRels( protected List<Oaf> addOtherResultRels(
final Document doc, final Document doc,
final KeyValue collectedFrom, final OafEntity entity) {
final DataInfo info,
final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false);
final String docId = entity.getId();
final List<Oaf> res = new ArrayList<>(); final List<Oaf> res = new ArrayList<>();
for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) { for (final Object o : doc.selectNodes("//*[local-name()='relatedDataset']")) {
@ -275,13 +272,11 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
res res
.add( .add(
getRelation( getRelation(
docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info, docId, otherId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, entity));
lastUpdateTimestamp));
res res
.add( .add(
getRelation( getRelation(
otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, collectedFrom, info, otherId, docId, RESULT_RESULT, RELATIONSHIP, IS_RELATED_TO, entity));
lastUpdateTimestamp));
} }
} }
return res; return res;
@ -295,6 +290,10 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) { protected List<StructuredProperty> prepareResultPids(final Document doc, final DataInfo info) {
return prepareListStructPropsWithValidQualifier( return prepareListStructPropsWithValidQualifier(
doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info); doc, "//oaf:identifier", "@identifierType", DNET_PID_TYPES, info)
.stream()
.map(CleaningFunctions::normalizePidValue)
.collect(Collectors.toList());
} }
} }

View File

@ -1,34 +1,26 @@
package eu.dnetlib.dhp.oa.graph.raw; package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import static eu.dnetlib.dhp.schema.common.ModelConstants.*; import static eu.dnetlib.dhp.schema.common.ModelConstants.*;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.field;
import static eu.dnetlib.dhp.schema.oaf.OafMapperUtils.structuredProperty;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Node; import org.dom4j.Node;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.common.PacePerson; import eu.dnetlib.dhp.common.PacePerson;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.DataInfo; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
public class OdfToOafMapper extends AbstractMdRecordToOafMapper { public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@ -198,7 +190,13 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) { protected List<Field<String>> prepareDescriptions(final Document doc, final DataInfo info) {
return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info); return prepareListFields(doc, "//datacite:description[@descriptionType='Abstract']", info)
.stream()
.map(d -> {
d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
return d;
})
.collect(Collectors.toList());
} }
@Override @Override
@ -314,11 +312,9 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
@Override @Override
protected List<Oaf> addOtherResultRels( protected List<Oaf> addOtherResultRels(
final Document doc, final Document doc,
final KeyValue collectedFrom, final OafEntity entity) {
final DataInfo info,
final long lastUpdateTimestamp) {
final String docId = createOpenaireId(50, doc.valueOf("//dri:objIdentifier"), false); final String docId = entity.getId();
final List<Oaf> res = new ArrayList<>(); final List<Oaf> res = new ArrayList<>();
@ -330,30 +326,26 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
final String otherId = createOpenaireId(50, originalId, false); final String otherId = createOpenaireId(50, originalId, false);
final String type = ((Node) o).valueOf("@relationType"); final String type = ((Node) o).valueOf("@relationType");
if (type.equalsIgnoreCase("IsSupplementTo")) { if (type.equalsIgnoreCase(IS_SUPPLEMENT_TO)) {
res res
.add( .add(
getRelation( getRelation(
docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, collectedFrom, info, docId, otherId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENT_TO, entity));
lastUpdateTimestamp));
res res
.add( .add(
getRelation( getRelation(
otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, collectedFrom, info, otherId, docId, RESULT_RESULT, SUPPLEMENT, IS_SUPPLEMENTED_BY, entity));
lastUpdateTimestamp)); } else if (type.equalsIgnoreCase(IS_PART_OF)) {
} else if (type.equals("IsPartOf")) {
res res
.add( .add(
getRelation( getRelation(
docId, otherId, RESULT_RESULT, PART, IS_PART_OF, collectedFrom, info, docId, otherId, RESULT_RESULT, PART, IS_PART_OF, entity));
lastUpdateTimestamp));
res res
.add( .add(
getRelation( getRelation(
otherId, docId, RESULT_RESULT, PART, HAS_PARTS, collectedFrom, info, otherId, docId, RESULT_RESULT, PART, HAS_PARTS, entity));
lastUpdateTimestamp));
} else { } else {
// TODO catch more semantics
} }
} }
} }
@ -384,7 +376,11 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
doc, doc,
"//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']", "//datacite:alternateIdentifier[@alternateIdentifierType != 'URL' and @alternateIdentifierType != 'landingPage']",
"@alternateIdentifierType", DNET_PID_TYPES, info)); "@alternateIdentifierType", DNET_PID_TYPES, info));
return Lists.newArrayList(res);
return res
.stream()
.map(CleaningFunctions::normalizePidValue)
.collect(Collectors.toList());
} }
} }

View File

@ -10,6 +10,7 @@ import org.apache.commons.lang3.StringUtils;
import com.google.common.collect.Maps; import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
public class Vocabulary implements Serializable { public class Vocabulary implements Serializable {

View File

@ -7,6 +7,7 @@ import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Qualifier; import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

View File

@ -4,7 +4,6 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser
import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation} import eu.dnetlib.dhp.schema.oaf.{Oaf, Relation}
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication, DLIUnknown}
import eu.dnetlib.dhp.sx.ebi.EBIAggregator import eu.dnetlib.dhp.sx.ebi.EBIAggregator
import eu.dnetlib.dhp.sx.ebi.model.{PMArticle, PMAuthor, PMJournal}
import org.apache.commons.io.IOUtils import org.apache.commons.io.IOUtils
import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession} import org.apache.spark.sql.{Dataset, Encoder, Encoders, SaveMode, SparkSession}
import org.slf4j.LoggerFactory import org.slf4j.LoggerFactory
@ -18,38 +17,38 @@ object SparkSplitOafTODLIEntities {
} }
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")))
val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass)
parser.parseArgument(args)
val workingPath: String = parser.get("workingPath") def extract_dataset(spark:SparkSession, workingPath:String) :Unit = {
logger.info(s"Working dir path = $workingPath")
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf].repartition(4000)
val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset].repartition(1000)
OAFDataset
.filter(s => s != null && s.isInstanceOf[DLIDataset])
.map(s =>s.asInstanceOf[DLIDataset])
.union(ebi_dataset)
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
.map(p => p._2)
.repartition(2000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset")
}
def extract_publication(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf] implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
implicit val datEncoder: Encoder[DLIDataset] = Encoders.kryo[DLIDataset]
implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
val spark:SparkSession = SparkSession
.builder()
.appName(SparkSplitOafTODLIEntities.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master(parser.get("master"))
.getOrCreate()
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf] val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
val ebi_dataset:Dataset[DLIDataset] = spark.read.load(s"$workingPath/ebi/baseline_dataset_ebi").as[DLIDataset] val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication].repartition(1000)
val ebi_publication:Dataset[DLIPublication] = spark.read.load(s"$workingPath/ebi/baseline_publication_ebi").as[DLIPublication]
val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation]
OAFDataset OAFDataset
@ -60,20 +59,17 @@ object SparkSplitOafTODLIEntities {
.groupByKey(_._1)(Encoders.STRING) .groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIPublicationAggregator().toColumn) .agg(EBIAggregator.getDLIPublicationAggregator().toColumn)
.map(p => p._2) .map(p => p._2)
.repartition(1000) .repartition(2000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/publication") .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/publication")
OAFDataset }
.filter(s => s != null && s.isInstanceOf[DLIDataset])
.map(s =>s.asInstanceOf[DLIDataset])
.union(ebi_dataset)
.map(d => (d.getId, d))(Encoders.tuple(Encoders.STRING, datEncoder))
.groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIDatasetAggregator().toColumn)
.map(p => p._2)
.repartition(1000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/dataset")
def extract_unknown(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val unkEncoder: Encoder[DLIUnknown] = Encoders.kryo[DLIUnknown]
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
OAFDataset OAFDataset
.filter(s => s != null && s.isInstanceOf[DLIUnknown]) .filter(s => s != null && s.isInstanceOf[DLIUnknown])
@ -82,9 +78,18 @@ object SparkSplitOafTODLIEntities {
.groupByKey(_._1)(Encoders.STRING) .groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getDLIUnknownAggregator().toColumn) .agg(EBIAggregator.getDLIUnknownAggregator().toColumn)
.map(p => p._2) .map(p => p._2)
.repartition(1000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/unknown") .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/unknown")
}
def extract_relations(spark:SparkSession, workingPath:String) :Unit = {
implicit val oafEncoder: Encoder[Oaf] = Encoders.kryo[Oaf]
implicit val relEncoder: Encoder[Relation] = Encoders.kryo[Relation]
val OAFDataset:Dataset[Oaf] = spark.read.load(s"$workingPath/input/OAFDataset").as[Oaf]
val ebi_relation:Dataset[Relation] = spark.read.load(s"$workingPath/ebi/baseline_relation_ebi").as[Relation].repartition(2000)
OAFDataset OAFDataset
.filter(s => s != null && s.isInstanceOf[Relation]) .filter(s => s != null && s.isInstanceOf[Relation])
@ -94,9 +99,35 @@ object SparkSplitOafTODLIEntities {
.groupByKey(_._1)(Encoders.STRING) .groupByKey(_._1)(Encoders.STRING)
.agg(EBIAggregator.getRelationAggregator().toColumn) .agg(EBIAggregator.getRelationAggregator().toColumn)
.map(p => p._2) .map(p => p._2)
.repartition(1000) .repartition(4000)
.write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation") .write.mode(SaveMode.Overwrite).save(s"$workingPath/graph/relation")
}
def main(args: Array[String]): Unit = {
val parser = new ArgumentApplicationParser(IOUtils.toString(SparkSplitOafTODLIEntities.getClass.getResourceAsStream("/eu/dnetlib/dhp/sx/graph/argumentparser/input_extract_entities_parameters.json")))
val logger = LoggerFactory.getLogger(SparkSplitOafTODLIEntities.getClass)
parser.parseArgument(args)
val workingPath: String = parser.get("workingPath")
val entity:String = parser.get("entity")
logger.info(s"Working dir path = $workingPath")
val spark:SparkSession = SparkSession
.builder()
.appName(SparkSplitOafTODLIEntities.getClass.getSimpleName)
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
.master(parser.get("master"))
.getOrCreate()
entity match {
case "publication" => extract_publication(spark, workingPath)
case "dataset" => extract_dataset(spark,workingPath)
case "relation" => extract_relations(spark, workingPath)
case "unknown" => extract_unknown(spark, workingPath)
}

View File

@ -1,4 +1,5 @@
[ [
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true}, {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true} {"paramName":"w", "paramLongName":"workingPath", "paramDescription": "the work dir path", "paramRequired": true},
{"paramName":"e", "paramLongName":"entity", "paramDescription": "the work dir path", "paramRequired": true}
] ]

View File

@ -14,30 +14,103 @@
</property> </property>
</parameters> </parameters>
<start to="ExtractDLIEntities"/> <start to="ExtractDLIPublication"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<action name="ExtractDLIEntities"> <action name="ExtractDLIPublication">
<spark xmlns="uri:oozie:spark-action:0.2"> <spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
<master>yarn-cluster</master> <master>yarn-cluster</master>
<mode>cluster</mode> <mode>cluster</mode>
<name>Extract DLI Entities</name> <name>Extract DLI Entities (Publication)</name>
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class> <class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar> <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts> <spark-opts>
--executor-memory ${sparkExecutorMemory} --executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores} --executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory} --driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=3840 --conf spark.sql.shuffle.partitions=5000
${sparkExtraOPT} ${sparkExtraOPT}
</spark-opts> </spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg> <arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg> <arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>-e</arg><arg>publication</arg>
</spark>
<ok to="ExtractDLIDataset"/>
<error to="Kill"/>
</action>
<action name="ExtractDLIDataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Extract DLI Entities (Dataset)</name>
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=5000
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>-e</arg><arg>dataset</arg>
</spark>
<ok to="ExtractDLIUnknown"/>
<error to="Kill"/>
</action>
<action name="ExtractDLIUnknown">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Extract DLI Entities (Unknown)</name>
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=5000
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>-e</arg><arg>unknown</arg>
</spark>
<ok to="ExtractDLIRelation"/>
<error to="Kill"/>
</action>
<action name="ExtractDLIRelation">
<spark xmlns="uri:oozie:spark-action:0.2">
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<master>yarn-cluster</master>
<mode>cluster</mode>
<name>Extract DLI Entities (Relation)</name>
<class>eu.dnetlib.dhp.sx.graph.SparkSplitOafTODLIEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.sql.shuffle.partitions=5000
${sparkExtraOPT}
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--workingPath</arg><arg>${workingPath}</arg>
<arg>-e</arg><arg>relation</arg>
</spark> </spark>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>

View File

@ -7,8 +7,6 @@ import static org.mockito.Mockito.lenient;
import java.io.IOException; import java.io.IOException;
import java.util.List; import java.util.List;
import java.util.Set; import java.util.Set;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream; import java.util.stream.Stream;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;

View File

@ -0,0 +1,99 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.Mockito.lenient;
import java.io.IOException;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class)
public class GenerateEntitiesApplicationTest {
@Mock
private ISLookUpService isLookUpService;
@Mock
private VocabularyGroup vocs;
@BeforeEach
public void setUp() throws IOException, ISLookUpException {
lenient().when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARIES_XQUERY)).thenReturn(vocs());
lenient()
.when(isLookUpService.quickSearchProfile(VocabularyGroup.VOCABULARY_SYNONYMS_XQUERY))
.thenReturn(synonyms());
vocs = VocabularyGroup.loadVocsFromIS(isLookUpService);
}
@Test
public void testMergeResult() throws IOException {
Result publication = getResult("oaf_record.xml", Publication.class);
Result dataset = getResult("odf_dataset.xml", Dataset.class);
Result software = getResult("odf_software.xml", Software.class);
Result orp = getResult("oaf_orp.xml", OtherResearchProduct.class);
verifyMerge(publication, dataset, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
verifyMerge(dataset, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
verifyMerge(publication, software, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
verifyMerge(software, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
verifyMerge(publication, orp, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
verifyMerge(orp, publication, Publication.class, ModelConstants.PUBLICATION_RESULTTYPE_CLASSID);
verifyMerge(dataset, software, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
verifyMerge(software, dataset, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
verifyMerge(dataset, orp, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
verifyMerge(orp, dataset, Dataset.class, ModelConstants.DATASET_RESULTTYPE_CLASSID);
verifyMerge(software, orp, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
verifyMerge(orp, software, Software.class, ModelConstants.SOFTWARE_RESULTTYPE_CLASSID);
}
protected <T extends Result> void verifyMerge(Result publication, Result dataset, Class<T> clazz,
String resultType) {
final Result merge = GenerateEntitiesApplication.mergeResults(publication, dataset);
assertTrue(clazz.isAssignableFrom(merge.getClass()));
assertEquals(resultType, merge.getResulttype().getClassid());
}
protected <T extends Result> Result getResult(String xmlFileName, Class<T> clazz) throws IOException {
final String xml = IOUtils.toString(getClass().getResourceAsStream(xmlFileName));
return new OdfToOafMapper(vocs, false)
.processMdRecord(xml)
.stream()
.filter(s -> clazz.isAssignableFrom(s.getClass()))
.map(s -> (Result) s)
.findFirst()
.get();
}
private List<String> vocs() throws IOException {
return IOUtils
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/terms.txt"));
}
private List<String> synonyms() throws IOException {
return IOUtils
.readLines(CleaningFunctionTest.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/clean/synonyms.txt"));
}
}

View File

@ -24,14 +24,8 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest; import eu.dnetlib.dhp.oa.graph.clean.CleaningFunctionTest;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.common.ModelConstants; import eu.dnetlib.dhp.schema.common.ModelConstants;
import eu.dnetlib.dhp.schema.oaf.Author; import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Dataset; import eu.dnetlib.dhp.schema.oaf.utils.IdentifierFactory;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService; import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
@ExtendWith(MockitoExtension.class) @ExtendWith(MockitoExtension.class)
@ -71,7 +65,7 @@ public class MappersTest {
assertValidId(p.getId()); assertValidId(p.getId());
assertTrue(p.getOriginalId().size() == 1); assertTrue(p.getOriginalId().size() == 2);
assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0)); assertEquals("10.3897/oneeco.2.e13718", p.getOriginalId().get(0));
assertValidId(p.getCollectedfrom().get(0).getKey()); assertValidId(p.getCollectedfrom().get(0).getKey());
@ -123,22 +117,7 @@ public class MappersTest {
assertNotNull(p.getBestaccessright()); assertNotNull(p.getBestaccessright());
assertEquals("OPEN", p.getBestaccessright().getClassid()); assertEquals("OPEN", p.getBestaccessright().getClassid());
assertValidId(r1.getSource()); verifyRelations(p, r1, r2);
assertValidId(r1.getTarget());
assertValidId(r2.getSource());
assertValidId(r2.getTarget());
assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey());
assertNotNull(r1.getDataInfo());
assertNotNull(r2.getDataInfo());
assertNotNull(r1.getDataInfo().getTrust());
assertNotNull(r2.getDataInfo().getTrust());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
// System.out.println(new ObjectMapper().writeValueAsString(p)); // System.out.println(new ObjectMapper().writeValueAsString(p));
// System.out.println(new ObjectMapper().writeValueAsString(r1)); // System.out.println(new ObjectMapper().writeValueAsString(r1));
@ -177,7 +156,7 @@ public class MappersTest {
final Relation r2 = (Relation) list.get(2); final Relation r2 = (Relation) list.get(2);
assertValidId(d.getId()); assertValidId(d.getId());
assertTrue(d.getOriginalId().size() == 1); assertTrue(d.getOriginalId().size() == 2);
assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0)); assertEquals("oai:zenodo.org:3234526", d.getOriginalId().get(0));
assertValidId(d.getCollectedfrom().get(0).getKey()); assertValidId(d.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue())); assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
@ -230,10 +209,19 @@ public class MappersTest {
}); });
assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid()); assertEquals("0001", d.getInstance().get(0).getRefereed().getClassid());
verifyRelations(d, r1, r2);
}
private void verifyRelations(OafEntity e, Relation r1, Relation r2) {
assertEquals(e.getId(), r1.getSource());
assertEquals(e.getId(), r2.getTarget());
assertValidId(r1.getSource()); assertValidId(r1.getSource());
assertValidId(r1.getTarget()); assertValidId(r1.getTarget());
assertValidId(r2.getSource()); assertValidId(r2.getSource());
assertValidId(r2.getTarget()); assertValidId(r2.getTarget());
assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey());
assertNotNull(r1.getDataInfo()); assertNotNull(r1.getDataInfo());
assertNotNull(r2.getDataInfo()); assertNotNull(r2.getDataInfo());
assertNotNull(r1.getDataInfo().getTrust()); assertNotNull(r1.getDataInfo().getTrust());

View File

@ -27,10 +27,10 @@ import org.mockito.junit.jupiter.MockitoExtension;
import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils;
import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup; import eu.dnetlib.dhp.oa.graph.raw.common.VocabularyGroup;
import eu.dnetlib.dhp.schema.oaf.Datasource; import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf; import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OafMapperUtils;
import eu.dnetlib.dhp.schema.oaf.Organization; import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation; import eu.dnetlib.dhp.schema.oaf.Relation;

View File

@ -30,7 +30,7 @@ class SparkScholexplorerAggregationTest {
implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication] implicit val pubEncoder: Encoder[DLIPublication] = Encoders.kryo[DLIPublication]
val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").getOrCreate() val spark: SparkSession = SparkSession.builder().appName("Test").master("local[*]").config("spark.driver.bindAddress", "127.0.0.1").getOrCreate()
val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication] val ds: Dataset[DLIPublication] = spark.createDataset(spark.sparkContext.parallelize(s)).as[DLIPublication]

View File

@ -0,0 +1,83 @@
<?xml version="1.0" encoding="UTF-8"?>
<record xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:dr="http://www.driver-repository.eu/namespace/dr"
xmlns:dri="http://www.driver-repository.eu/namespace/dri"
xmlns:oaf="http://namespace.openaire.eu/oaf"
xmlns:prov="http://www.openarchives.org/OAI/2.0/provenance" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<header xmlns="http://namespace.openaire.eu/">
<dri:objIdentifier>pensoft_____::00ea4a1cd53806a97d62ea6bf268f2a2</dri:objIdentifier>
<dri:recordIdentifier>10.3897/oneeco.2.e13718</dri:recordIdentifier>
<dri:dateOfCollection/>
<dri:mdFormat/>
<dri:mdFormatInterpretation/>
<dri:repositoryId/>
<dr:objectIdentifier/>
<dr:dateOfCollection>2020-03-23T00:20:51.392Z</dr:dateOfCollection>
<dr:dateOfTransformation>2020-03-23T00:26:59.078Z</dr:dateOfTransformation>
<oaf:datasourceprefix>pensoft_____</oaf:datasourceprefix>
</header>
<metadata xmlns="http://namespace.openaire.eu/">
<dc:title>Ecosystem Service capacity is higher in areas of multiple designation types</dc:title>
<dc:creator>Nikolaidou,Charitini</dc:creator>
<dc:creator nameIdentifier="0000-0001-6651-1178" nameIdentifierScheme="ORCID">Votsi,Nefta</dc:creator>
<dc:creator>Sgardelis,Steanos</dc:creator>
<dc:creator>Halley,John</dc:creator>
<dc:creator>Pantis,John</dc:creator>
<dc:creator>Tsiafouli,Maria</dc:creator>
<dc:date>2017</dc:date>
<dc:description>The implementation of the Ecosystem Service (ES) concept into practice might be a challenging task as it has to take into account previous “traditional” policies and approaches that have evaluated nature and biodiversity differently. Among them the Habitat (92/43/EC) and Bird Directives (79/409/EC), the Water Framework Directive (2000/60/EC), and the Noise Directive (2002/49/EC) have led to the evaluation/designation of areas in Europe with different criteria. In this study our goal was to understand how the ES capacity of an area is related to its designation and if areas with multiple designations have higher capacity in providing ES. We selected four catchments in Greece with a great variety of characteristics covering over 25% of the national territory. Inside the catchments we assessed the ES capacity (following the methodology of Burkhard et al. 2009) of areas designated as Natura 2000 sites, Quiet areas and Wetlands or Water bodies and found those areas that have multiple designations. Data were analyzed by GLM to reveal differences regarding the ES capacity among the different types of areas. We also investigated by PCA synergies and trade-offs among different kinds of ES and tested for correlations among landscape properties, such as elevation, aspect and slope and the ES potential. Our results show that areas with different types or multiple designations have a different capacity in providing ES. Areas of one designation type (Protected or Quiet Areas) had in general intermediate scores in most ES but scores were higher compared to areas with no designation, which displayed stronger capacity in provisioning services. Among Protected Areas and Quiet Areas the latter scored better in general. Areas that combined both designation types (Protected and Quiet Areas) showed the highest capacity in 13 out of 29 ES, that were mostly linked with natural and forest ecosystems. We found significant synergies among most regulating, supporting and cultural ES which in turn display trade-offs with provisioning services. The different ES are spatially related and display strong correlation with landscape properties, such as elevation and slope. We suggest that the designation status of an area can be used as an alternative tool for environmental policy, indicating the capacity for ES provision. Multiple designations of areas can be used as proxies for locating ES “hotspots”. This integration of “traditional” evaluation and designation and the “newer” ES concept forms a time- and cost-effective way to be adopted by stakeholders and policy-makers in order to start complying with new standards and demands for nature conservation and environmental management.</dc:description>
<dc:format>text/html</dc:format>
<dc:identifier>https://doi.org/10.3897/oneeco.2.e13718</dc:identifier>
<dc:identifier>https://oneecosystem.pensoft.net/article/13718/</dc:identifier>
<dc:language>eng</dc:language>
<dc:publisher>Pensoft Publishers</dc:publisher>
<dc:relation>info:eu-repo/semantics/altIdentifier/eissn/2367-8194</dc:relation>
<dc:relation>info:eu-repo/grantAgreement/EC/FP7/226852</dc:relation>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:source>One Ecosystem 2: e13718</dc:source>
<dc:subject>Ecosystem Services hotspots</dc:subject>
<dc:subject>Natura 2000</dc:subject>
<dc:subject>Quiet Protected Areas</dc:subject>
<dc:subject>Biodiversity</dc:subject>
<dc:subject>Agriculture</dc:subject>
<dc:subject>Elevation</dc:subject>
<dc:subject>Slope</dc:subject>
<dc:subject>Ecosystem Service trade-offs and synergies</dc:subject>
<dc:subject> cultural services</dc:subject>
<dc:subject>provisioning services</dc:subject>
<dc:subject>regulating services</dc:subject>
<dc:subject>supporting services</dc:subject>
<dc:type>Research Artefact</dc:type>
<dr:CobjCategory type="other">0020</dr:CobjCategory>
<oaf:dateAccepted>2017-01-01</oaf:dateAccepted>
<oaf:projectid>corda_______::226852</oaf:projectid>
<oaf:accessrights>OPEN</oaf:accessrights>
<oaf:hostedBy id="openaire____::issn226852" name="One Ecosystem"/>
<oaf:collectedFrom
id="openaire____::45e3c7b69bcee6cc5fa945c9e183deb9" name="Pensoft"/>
<oaf:identifier identifierType="doi">10.3897/oneeco.2.e13718</oaf:identifier>
<oaf:fulltext>https://oneecosystem.pensoft.net/article/13718/</oaf:fulltext>
<oaf:journal eissn="2367-8194" issn="">One Ecosystem</oaf:journal>
<oaf:refereed>0001</oaf:refereed>
</metadata>
<about xmlns:oai="http://www.openarchives.org/OAI/2.0/">
<provenance xmlns="http://www.openarchives.org/OAI/2.0/provenance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/provenance http://www.openarchives.org/OAI/2.0/provenance.xsd">
<originDescription altered="true" harvestDate="2020-03-23T00:20:51.392Z">
<baseURL>http%3A%2F%2Fzookeys.pensoft.net%2Foai.php</baseURL>
<identifier>10.3897/oneeco.2.e13718</identifier>
<datestamp>2017-09-08</datestamp>
<metadataNamespace>http://www.openarchives.org/OAI/2.0/oai_dc/</metadataNamespace>
</originDescription>
</provenance>
<oaf:datainfo>
<oaf:inferred>false</oaf:inferred>
<oaf:deletedbyinference>false</oaf:deletedbyinference>
<oaf:trust>0.9</oaf:trust>
<oaf:inferenceprovenance/>
<oaf:provenanceaction classid="sysimport:crosswalk:repository"
classname="sysimport:crosswalk:repository"
schemeid="dnet:provenanceActions" schemename="dnet:provenanceActions"/>
</oaf:datainfo>
</about>
</record>

View File

@ -62,6 +62,10 @@
<artifactId>dhp-schemas</artifactId> <artifactId>dhp-schemas</artifactId>
<version>${project.version}</version> <version>${project.version}</version>
</dependency> </dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
</dependency>
<dependency> <dependency>
<groupId>org.elasticsearch</groupId> <groupId>org.elasticsearch</groupId>

View File

@ -0,0 +1,111 @@
package eu.dnetlib.dhp.export.zenodo;
import java.io.*;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
public class MakeTar implements Serializable {
private static final Logger log = LoggerFactory.getLogger(MakeTar.class);
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils
.toString(
MakeTar.class
.getResourceAsStream(
"/eu/dnetlib/dhp/export/input_maketar_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
final String outputPath = parser.get("targetPath");
log.info("hdfsPath: {}", outputPath);
final String hdfsNameNode = parser.get("nameNode");
log.info("nameNode: {}", hdfsNameNode);
final String inputPath = parser.get("sourcePath");
log.info("input path : {}", inputPath);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
makeTArArchive(fileSystem, inputPath, outputPath);
}
public static void makeTArArchive(FileSystem fileSystem, String inputPath, String outputPath) throws IOException {
RemoteIterator<LocatedFileStatus> dir_iterator = fileSystem.listLocatedStatus(new Path(inputPath));
while (dir_iterator.hasNext()) {
LocatedFileStatus fileStatus = dir_iterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
String entity = p_string.substring(p_string.lastIndexOf("/") + 1);
write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity);
}
}
private static void write(FileSystem fileSystem, String inputPath, String outputPath, String dir_name)
throws IOException {
Path hdfsWritePath = new Path(outputPath);
FSDataOutputStream fsDataOutputStream = null;
if (fileSystem.exists(hdfsWritePath)) {
fileSystem.delete(hdfsWritePath, true);
}
fsDataOutputStream = fileSystem.create(hdfsWritePath);
TarArchiveOutputStream ar = new TarArchiveOutputStream(fsDataOutputStream.getWrappedStream());
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
new Path(inputPath), true);
while (fileStatusListIterator.hasNext()) {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name + ".json.gz");
entry.setSize(fileStatus.getLen());
ar.putArchiveEntry(entry);
InputStream is = fileSystem.open(fileStatus.getPath());
BufferedInputStream bis = new BufferedInputStream(is);
int count;
byte data[] = new byte[1024];
while ((count = bis.read(data, 0, data.length)) != -1) {
ar.write(data, 0, count);
}
bis.close();
ar.closeArchiveEntry();
}
}
ar.close();
}
}

View File

@ -0,0 +1,80 @@
package eu.dnetlib.dhp.export.zenodo;
import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.*;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.api.MissingConceptDoiException;
import eu.dnetlib.dhp.common.api.ZenodoAPIClient;
public class SendToZenodoHDFS implements Serializable {
private static final Log log = LogFactory.getLog(SendToZenodoHDFS.class);
public static void main(final String[] args) throws Exception, MissingConceptDoiException {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
SendToZenodoHDFS.class
.getResourceAsStream(
"/eu/dnetlib/dhp/export/upload_zenodo.json")));
parser.parseArgument(args);
final String hdfsPath = parser.get("hdfsPath");
final String hdfsNameNode = parser.get("nameNode");
final String access_token = parser.get("accessToken");
final String connection_url = parser.get("connectionUrl");
final String metadata = parser.get("metadata");
final Boolean newDeposition = Boolean.valueOf(parser.get("newDeposition"));
final String concept_rec_id = Optional
.ofNullable(parser.get("conceptRecordId"))
.orElse(null);
Configuration conf = new Configuration();
conf.set("fs.defaultFS", hdfsNameNode);
FileSystem fileSystem = FileSystem.get(conf);
RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
.listFiles(
new Path(hdfsPath), true);
ZenodoAPIClient zenodoApiClient = new ZenodoAPIClient(connection_url, access_token);
if (newDeposition) {
zenodoApiClient.newDeposition();
} else {
if (concept_rec_id == null) {
throw new MissingConceptDoiException("No concept record id has been provided");
}
zenodoApiClient.newVersion(concept_rec_id);
}
while (fileStatusListIterator.hasNext()) {
LocatedFileStatus fileStatus = fileStatusListIterator.next();
Path p = fileStatus.getPath();
String p_string = p.toString();
if (!p_string.endsWith("_SUCCESS")) {
// String tmp = p_string.substring(0, p_string.lastIndexOf("/"));
String name = p_string.substring(p_string.lastIndexOf("/") + 1);
log.info("Sending information for community: " + name);
FSDataInputStream inputStream = fileSystem.open(p);
zenodoApiClient.uploadIS(inputStream, name, fileStatus.getLen());
}
}
zenodoApiClient.sendMretadata(metadata);
zenodoApiClient.publish();
}
}

View File

@ -0,0 +1,20 @@
[
{
"paramName": "n",
"paramLongName": "nameNode",
"paramDescription": "the Name Node",
"paramRequired": true
},
{
"paramName": "s",
"paramLongName": "sourcePath",
"paramDescription": "the source path",
"paramRequired": true
},
{
"paramName": "t",
"paramLongName": "targetPath",
"paramDescription": "the target path",
"paramRequired": true
}
]

View File

@ -0,0 +1,45 @@
[
{
"paramName":"nd",
"paramLongName":"newDeposition",
"paramDescription": "if it is a new deposition (true) or a new version (false)",
"paramRequired": true
},
{
"paramName":"cri",
"paramLongName":"conceptRecordId",
"paramDescription": "The id of the concept record for a new version",
"paramRequired": false
},
{
"paramName":"hdfsp",
"paramLongName":"hdfsPath",
"paramDescription": "the path of the folder tofind files to send to Zenodo",
"paramRequired": true
},
{
"paramName": "nn",
"paramLongName": "nameNode",
"paramDescription": "the name node",
"paramRequired": true
},
{
"paramName": "at",
"paramLongName": "accessToken",
"paramDescription": "the access token for the deposition",
"paramRequired": false
},
{
"paramName":"cu",
"paramLongName":"connectionUrl",
"paramDescription": "the url to connect to deposit",
"paramRequired": false
},
{
"paramName":"m",
"paramLongName":"metadata",
"paramDescription": "metadata associated to the deposition",
"paramRequired": false
}
]

View File

@ -0,0 +1,48 @@
<configuration>
<property>
<name>jobTracker</name>
<value>yarnRM</value>
</property>
<property>
<name>nameNode</name>
<value>hdfs://nameservice1</value>
</property>
<property>
<name>oozie.use.system.libpath</name>
<value>true</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>spark2</value>
</property>
<property>
<name>oozie.wf.rerun.failnodes</name>
<value>false</value>
</property>
<property>
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration>

View File

@ -0,0 +1,53 @@
<workflow-app name="Send Dump to Zenodo" xmlns="uri:oozie:workflow:0.5">
<parameters>
<property>
<name>sourcePath</name>
<description>the source path</description>
</property>
<property>
<name>targetPath</name>
<description>the target path</description>
</property>
<property>
<name>metadata</name>
<description>the metadata</description>
</property>
</parameters>
<start to="send_zenodo"/>
<kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="MakeTar">
<java>
<job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node>
<main-class>eu.dnetlib.dhp.export.zenodo.MakeTar</main-class>
<arg>-t</arg><arg>${targetPath}</arg>
<arg>-n</arg><arg>${nameNode}</arg>
<arg>-s</arg><arg>${sourcePath}</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<action name="send_zenodo">
<java>
<main-class>eu.dnetlib.dhp.export.zenodo.SendToZenodoHDFS</main-class>
<arg>--hdfsPath</arg><arg>/user/dnet.scholexplorer/scholix/provision/scholix.tar/scholix-2020-10-16.tar</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--accessToken</arg><arg>b6ddrY6b77WxcDEevn9gqVE5sL5sDNjdUijt75W3o7cQo5vpFFI48dMiu8Gv</arg>
<arg>--connectionUrl</arg><arg>https://zenodo.org/api/deposit/depositions</arg>
<arg>--metadata</arg><arg>${metadata}</arg>
<arg>--conceptRecordId</arg><arg>1200252</arg>
<arg>--newDeposition</arg><arg>false</arg>
</java>
<ok to="End"/>
<error to="Kill"/>
</action>
<end name="End"/>
</workflow-app>

View File

@ -3,14 +3,15 @@ package eu.dnetlib.dhp.export
import java.time.LocalDateTime import java.time.LocalDateTime
import java.time.format.DateTimeFormatter import java.time.format.DateTimeFormatter
import eu.dnetlib.dhp.provision.scholix.Scholix
import eu.dnetlib.dhp.provision.scholix.summary.ScholixSummary
import eu.dnetlib.dhp.schema.oaf.Relation import eu.dnetlib.dhp.schema.oaf.Relation
import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication} import eu.dnetlib.dhp.schema.scholexplorer.{DLIDataset, DLIPublication}
import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig} import org.codehaus.jackson.map.{ObjectMapper, SerializationConfig}
import org.junit.jupiter.api.Test import org.junit.jupiter.api.Test
import scala.io.Source import scala.io.Source
import scala.collection.JavaConverters._
class ExportDLITOOAFTest { class ExportDLITOOAFTest {
val mapper = new ObjectMapper() val mapper = new ObjectMapper()
@ -22,12 +23,27 @@ class ExportDLITOOAFTest {
} }
def extractDatasources(s:Scholix):List[String]= {
s.getTarget.getCollectedFrom.asScala.map(c => c.getProvider.getName)(collection.breakOut)
}
def extractDatasources(s:ScholixSummary):List[String] = {
s.getDatasources.asScala.map(c => c.getDatasourceName)(collection.breakOut)
}
@Test @Test
def testMappingRele():Unit = { def testMappingRele():Unit = {
val r:Relation = new Relation val r:Relation = new Relation
r.setSource("60|fbff1d424e045eecf24151a5fe3aa738") r.setSource("60|fbff1d424e045eecf24151a5fe3aa738")
r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877") r.setTarget("50|dedup_wf_001::ec409f09e63347d4e834087fe1483877")
r.setRelType("IsReferencedBy")
val r1 =DLIToOAF.convertDLIRelation(r) val r1 =DLIToOAF.convertDLIRelation(r)
println(r1.getSource, r1.getTarget) println(r1.getSource, r1.getTarget)

View File

@ -164,7 +164,7 @@ public class CreateRelatedEntitiesJob_phase1 {
if (result.getTitle() != null && !result.getTitle().isEmpty()) { if (result.getTitle() != null && !result.getTitle().isEmpty()) {
final StructuredProperty title = result.getTitle().stream().findFirst().get(); final StructuredProperty title = result.getTitle().stream().findFirst().get();
title.setValue(StringUtils.left(title.getValue(), ProvisionConstants.MAX_TITLE_LENGTH)); title.setValue(StringUtils.left(title.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
re.setTitle(title); re.setTitle(title);
} }
@ -178,7 +178,7 @@ public class CreateRelatedEntitiesJob_phase1 {
.getInstance() .getInstance()
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.limit(ProvisionConstants.MAX_INSTANCES) .limit(ModelHardLimits.MAX_INSTANCES)
.collect(Collectors.toList())); .collect(Collectors.toList()));
} }

View File

@ -240,15 +240,15 @@ public class CreateRelatedEntitiesJob_phase2 {
List<ExternalReference> refs = r List<ExternalReference> refs = r
.getExternalReference() .getExternalReference()
.stream() .stream()
.limit(ProvisionConstants.MAX_EXTERNAL_ENTITIES) .limit(ModelHardLimits.MAX_EXTERNAL_ENTITIES)
.collect(Collectors.toList()); .collect(Collectors.toList());
r.setExternalReference(refs); r.setExternalReference(refs);
} }
if (r.getAuthor() != null) { if (r.getAuthor() != null) {
List<Author> authors = Lists.newArrayList(); List<Author> authors = Lists.newArrayList();
for (Author a : r.getAuthor()) { for (Author a : r.getAuthor()) {
a.setFullname(StringUtils.left(a.getFullname(), ProvisionConstants.MAX_AUTHOR_FULLNAME_LENGTH)); a.setFullname(StringUtils.left(a.getFullname(), ModelHardLimits.MAX_AUTHOR_FULLNAME_LENGTH));
if (authors.size() < ProvisionConstants.MAX_AUTHORS || hasORCID(a)) { if (authors.size() < ModelHardLimits.MAX_AUTHORS || hasORCID(a)) {
authors.add(a); authors.add(a);
} }
} }
@ -260,7 +260,7 @@ public class CreateRelatedEntitiesJob_phase2 {
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map(d -> { .map(d -> {
d.setValue(StringUtils.left(d.getValue(), ProvisionConstants.MAX_ABSTRACT_LENGTH)); d.setValue(StringUtils.left(d.getValue(), ModelHardLimits.MAX_ABSTRACT_LENGTH));
return d; return d;
}) })
.collect(Collectors.toList()); .collect(Collectors.toList());
@ -272,10 +272,10 @@ public class CreateRelatedEntitiesJob_phase2 {
.stream() .stream()
.filter(Objects::nonNull) .filter(Objects::nonNull)
.map(t -> { .map(t -> {
t.setValue(StringUtils.left(t.getValue(), ProvisionConstants.MAX_TITLE_LENGTH)); t.setValue(StringUtils.left(t.getValue(), ModelHardLimits.MAX_TITLE_LENGTH));
return t; return t;
}) })
.limit(ProvisionConstants.MAX_TITLES) .limit(ModelHardLimits.MAX_TITLES)
.collect(Collectors.toList()); .collect(Collectors.toList());
r.setTitle(titles); r.setTitle(titles);
} }

View File

@ -44,6 +44,7 @@
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
</ARCS> </ARCS>
</NODE> </NODE>
<NODE isStart="true" name="setRawGraphPath" type="SetEnvParameter"> <NODE isStart="true" name="setRawGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the RAW graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the RAW graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
@ -54,31 +55,45 @@
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
</ARCS> </ARCS>
</NODE> </NODE>
<NODE isStart="true" name="setFirstCleanedGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the first CLEANED graph</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">firstCleanedGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/prod_provision/graph/02_graph_first_cleaned</PARAM>
</PARAMETERS>
<ARCS>
<ARC to="waitConfig"/>
</ARCS>
</NODE>
<NODE isStart="true" name="setDedupGraphPath" type="SetEnvParameter"> <NODE isStart="true" name="setDedupGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the DEDUPED graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the DEDUPED graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">dedupGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">dedupGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/02_graph_dedup</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/03_graph_dedup</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
</ARCS> </ARCS>
</NODE> </NODE>
<NODE isStart="true" name="setInferredGraphPath" type="SetEnvParameter"> <NODE isStart="true" name="setInferredGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the INFERRED graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the INFERRED graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">inferredGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">inferredGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/03_graph_inferred</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/04_graph_inferred</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
</ARCS> </ARCS>
</NODE> </NODE>
<NODE isStart="true" name="setConsistentGraphPath" type="SetEnvParameter"> <NODE isStart="true" name="setConsistentGraphPath" type="SetEnvParameter">
<DESCRIPTION>Set the target path to store the CONSISTENCY graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the CONSISTENCY graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">consistentGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">consistentGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/04_graph_consistent</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/05_graph_consistent</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -89,7 +104,7 @@
<DESCRIPTION>Set the target path to store the ORCID enriched graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the ORCID enriched graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">orcidGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">orcidGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/05_graph_orcid</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/06_graph_orcid</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -100,7 +115,7 @@
<DESCRIPTION>Set the target path to store the BULK TAGGED graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the BULK TAGGED graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">bulkTaggingGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">bulkTaggingGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/06_graph_bulktagging</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/07_graph_bulktagging</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -111,7 +126,7 @@
<DESCRIPTION>Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the AFFILIATION from INSTITUTIONAL REPOS graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">affiliationGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">affiliationGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/07_graph_affiliation</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/08_graph_affiliation</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -122,7 +137,7 @@
<DESCRIPTION>Set the target path to store the COMMUNITY from SELECTED SOURCES graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the COMMUNITY from SELECTED SOURCES graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">communityOrganizationGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">communityOrganizationGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/08_graph_comunity_organization</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/09_graph_comunity_organization</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -133,7 +148,7 @@
<DESCRIPTION>Set the target path to store the FUNDING from SEMANTIC RELATION graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the FUNDING from SEMANTIC RELATION graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">fundingGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">fundingGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/09_graph_funding</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/10_graph_funding</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -144,7 +159,7 @@
<DESCRIPTION>Set the target path to store the COMMUNITY from SEMANTIC RELATION graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the COMMUNITY from SEMANTIC RELATION graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">communitySemRelGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">communitySemRelGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/10_graph_comunity_sem_rel</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/11_graph_comunity_sem_rel</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -155,7 +170,7 @@
<DESCRIPTION>Set the target path to store the COUNTRY enriched graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the COUNTRY enriched graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">countryGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">countryGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/11_graph_country</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/12_graph_country</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -166,7 +181,7 @@
<DESCRIPTION>Set the target path to store the CLEANED graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the CLEANED graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">cleanedGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">cleanedGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/12_graph_cleaned</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/13_graph_cleaned</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -177,7 +192,7 @@
<DESCRIPTION>Set the target path to store the blacklisted graph</DESCRIPTION> <DESCRIPTION>Set the target path to store the blacklisted graph</DESCRIPTION>
<PARAMETERS> <PARAMETERS>
<PARAM managedBy="system" name="parameterName" required="true" type="string">blacklistedGraphPath</PARAM> <PARAM managedBy="system" name="parameterName" required="true" type="string">blacklistedGraphPath</PARAM>
<PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/13_graph_blacklisted</PARAM> <PARAM managedBy="user" name="parameterValue" required="true" type="string">/tmp/beta_provision/graph/14_graph_blacklisted</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS> <ARCS>
<ARC to="waitConfig"/> <ARC to="waitConfig"/>
@ -324,6 +339,31 @@
</PARAM> </PARAM>
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM> <PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
</PARAMETERS> </PARAMETERS>
<ARCS>
<ARC to="graphCleaningFirst"/>
</ARCS>
</NODE>
<NODE name="graphCleaningFirst" type="SubmitHadoopJob">
<DESCRIPTION>clean the properties in the graph typed as Qualifier according to the vocabulary indicated in schemeid</DESCRIPTION>
<PARAMETERS>
<PARAM managedBy="system" name="hadoopJob" required="true" type="string">executeOozieJob</PARAM>
<PARAM managedBy="system" name="cluster" required="true" type="string">IIS</PARAM>
<PARAM managedBy="system" name="envParams" required="true" type="string">
{
'graphInputPath' : 'rawGraphPath',
'graphOutputPath': 'firstCleanedGraphPath',
'isLookupUrl': 'isLookUpUrl'
}
</PARAM>
<PARAM managedBy="system" name="params" required="true" type="string">
{
'oozie.wf.application.path' : '/lib/dnet/oa/graph/clean/oozie_app',
'workingPath' : '/tmp/beta_provision/working_dir/first_clean'
}
</PARAM>
<PARAM managedBy="system" name="oozieReportActionsCsv" required="true" type="string">build-report</PARAM>
</PARAMETERS>
<ARCS> <ARCS>
<ARC to="duplicateScan"/> <ARC to="duplicateScan"/>
</ARCS> </ARCS>
@ -337,7 +377,7 @@
<PARAM managedBy="system" name="envParams" required="true" type="string"> <PARAM managedBy="system" name="envParams" required="true" type="string">
{ {
'actionSetId' : 'dedupConfig', 'actionSetId' : 'dedupConfig',
'graphBasePath' : 'rawGraphPath', 'graphBasePath' : 'firstCleanedGraphPath',
'dedupGraphPath': 'dedupGraphPath', 'dedupGraphPath': 'dedupGraphPath',
'isLookUpUrl' : 'isLookUpUrl' 'isLookUpUrl' : 'isLookUpUrl'
} }