master #11

Manually merged
claudio.atzori merged 275 commits from :master into enrichment_wfs 2020-05-11 15:14:56 +02:00
46 changed files with 1661 additions and 671 deletions
Showing only changes of commit 259525cb93 - Show all commits

View File

@ -1,14 +1,25 @@
package eu.dnetlib.dhp.schema.oaf;
import java.io.Serializable;
import java.util.List;
import java.util.Objects;
public abstract class Oaf implements Serializable {
protected List<KeyValue> collectedfrom;
private DataInfo dataInfo;
private Long lastupdatetimestamp;
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
}
public DataInfo getDataInfo() {
return dataInfo;
}

View File

@ -10,8 +10,6 @@ public abstract class OafEntity extends Oaf implements Serializable {
private List<String> originalId;
private List<KeyValue> collectedfrom;
private List<StructuredProperty> pid;
private String dateofcollection;
@ -38,14 +36,6 @@ public abstract class OafEntity extends Oaf implements Serializable {
this.originalId = originalId;
}
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
}
public List<StructuredProperty> getPid() {
return pid;
}

View File

@ -18,8 +18,6 @@ public class Relation extends Oaf {
private String target;
private List<KeyValue> collectedFrom = new ArrayList<>();
public String getRelType() {
return relType;
}
@ -60,14 +58,6 @@ public class Relation extends Oaf {
this.target = target;
}
public List<KeyValue> getCollectedFrom() {
return collectedFrom;
}
public void setCollectedFrom(final List<KeyValue> collectedFrom) {
this.collectedFrom = collectedFrom;
}
public void mergeFrom(final Relation r) {
checkArgument(Objects.equals(getSource(), r.getSource()), "source ids must be equal");
@ -77,12 +67,12 @@ public class Relation extends Oaf {
Objects.equals(getSubRelType(), r.getSubRelType()), "subRelType(s) must be equal");
checkArgument(Objects.equals(getRelClass(), r.getRelClass()), "relClass(es) must be equal");
setCollectedFrom(
setCollectedfrom(
Stream.concat(
Optional.ofNullable(getCollectedFrom())
Optional.ofNullable(getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()),
Optional.ofNullable(r.getCollectedFrom())
Optional.ofNullable(r.getCollectedfrom())
.map(Collection::stream)
.orElse(Stream.empty()))
.distinct() // relies on KeyValue.equals
@ -103,6 +93,6 @@ public class Relation extends Oaf {
@Override
public int hashCode() {
return Objects.hash(relType, subRelType, relClass, source, target, collectedFrom);
return Objects.hash(relType, subRelType, relClass, source, target, collectedfrom);
}
}

View File

@ -47,6 +47,16 @@
<artifactId>jaxen</artifactId>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-distcp</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaire-data-protos</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
@ -57,6 +67,44 @@
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-api</artifactId>
</dependency>
<dependency>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-actionmanager-common</artifactId>
<exclusions>
<exclusion>
<groupId>eu.dnetlib</groupId>
<artifactId>dnet-openaireplus-mapping-utils</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon</artifactId>
</exclusion>
<exclusion>
<groupId>saxonica</groupId>
<artifactId>saxon-dom</artifactId>
</exclusion>
<exclusion>
<groupId>jgrapht</groupId>
<artifactId>jgrapht</artifactId>
</exclusion>
<exclusion>
<groupId>net.sf.ehcache</groupId>
<artifactId>ehcache</artifactId>
</exclusion>
<exclusion>
<groupId>org.springframework</groupId>
<artifactId>spring-test</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.*</groupId>
<artifactId>*</artifactId>
</exclusion>
<exclusion>
<groupId>apache</groupId>
<artifactId>*</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dhp.migration.actions;
package eu.dnetlib.dhp.actionmanager.migration;
import eu.dnetlib.data.proto.FieldTypeProtos.Qualifier;
import java.util.Comparator;

View File

@ -1,4 +1,4 @@
package eu.dnetlib.dhp.migration.actions;
package eu.dnetlib.dhp.actionmanager.migration;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
@ -9,35 +9,36 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;
import org.apache.hadoop.util.ToolRunner;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class MigrateActionSet {
private static final Log log = LogFactory.getLog(MigrateActionSet.class);
private static final Logger log = LoggerFactory.getLogger(MigrateActionSet.class);
private static final String SEPARATOR = "/";
private static final String TARGET_PATHS = "target_paths";
private static final String RAWSET_PREFIX = "rawset_";
private static Boolean DEFAULT_TRANSFORM_ONLY = false;
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser =
new ArgumentApplicationParser(
IOUtils.toString(
MigrateActionSet.class.getResourceAsStream(
"/eu/dnetlib/dhp/migration/migrate_actionsets_parameters.json")));
"/eu/dnetlib/dhp/actionmanager/migration/migrate_actionsets_parameters.json")));
parser.parseArgument(args);
new MigrateActionSet().run(parser);
@ -56,11 +57,11 @@ public class MigrateActionSet {
final String transform_only_s = parser.get("transform_only");
log.info("transform only param: " + transform_only_s);
log.info("transform only param: {}", transform_only_s);
final Boolean transformOnly = Boolean.valueOf(parser.get("transform_only"));
log.info("transform only: " + transformOnly);
log.info("transform only: {}", transformOnly);
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
@ -79,22 +80,19 @@ public class MigrateActionSet {
final List<Path> sourcePaths = getSourcePaths(sourceNN, isLookUp);
log.info(
String.format(
"paths to process:\n%s",
sourcePaths.stream()
.map(p -> p.toString())
.collect(Collectors.joining("\n"))));
"paths to process:\n{}",
sourcePaths.stream().map(p -> p.toString()).collect(Collectors.joining("\n")));
for (Path source : sourcePaths) {
if (!sourceFS.exists(source)) {
log.warn(String.format("skipping unexisting path: %s", source));
log.warn("skipping unexisting path: {}", source);
} else {
LinkedList<String> pathQ =
Lists.newLinkedList(Splitter.on(SEPARATOR).split(source.toUri().getPath()));
final String rawSet = pathQ.pollLast();
log.info(String.format("got RAWSET: %s", rawSet));
log.info("got RAWSET: {}", rawSet);
if (StringUtils.isNotBlank(rawSet) && rawSet.startsWith(RAWSET_PREFIX)) {
@ -109,7 +107,7 @@ public class MigrateActionSet {
+ SEPARATOR
+ rawSet);
log.info(String.format("using TARGET PATH: %s", targetPath));
log.info("using TARGET PATH: {}", targetPath);
if (!transformOnly) {
if (targetFS.exists(targetPath)) {

View File

@ -1,4 +1,9 @@
package eu.dnetlib.dhp.migration.actions;
package eu.dnetlib.dhp.actionmanager.migration;
import static eu.dnetlib.data.proto.KindProtos.Kind.entity;
import static eu.dnetlib.data.proto.KindProtos.Kind.relation;
import static eu.dnetlib.data.proto.TypeProtos.*;
import static eu.dnetlib.data.proto.TypeProtos.Type.*;
import com.google.common.collect.Lists;
import com.googlecode.protobuf.format.JsonFormat;
@ -41,7 +46,7 @@ public class ProtoConverter implements Serializable {
rel.setRelType(r.getRelType().toString());
rel.setSubRelType(r.getSubRelType().toString());
rel.setRelClass(r.getRelClass());
rel.setCollectedFrom(
rel.setCollectedfrom(
r.getCollectedfromCount() > 0
? r.getCollectedfromList().stream()
.map(kv -> mapKV(kv))

View File

@ -0,0 +1,179 @@
package eu.dnetlib.dhp.actionmanager.migration;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.protobuf.InvalidProtocolBufferException;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import java.io.IOException;
import java.io.Serializable;
import java.util.LinkedList;
import java.util.Objects;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
public class TransformActions implements Serializable {
private static final Logger log = LoggerFactory.getLogger(TransformActions.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String SEPARATOR = "/";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser =
new ArgumentApplicationParser(
IOUtils.toString(
MigrateActionSet.class.getResourceAsStream(
"/eu/dnetlib/dhp/actionmanager/migration/transform_actionsets_parameters.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged =
Optional.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String inputPaths = parser.get("inputPaths");
if (StringUtils.isBlank(inputPaths)) {
throw new RuntimeException("empty inputPaths");
}
log.info("inputPaths: {}", inputPaths);
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
SparkConf conf = new SparkConf();
runWithSparkSession(
conf,
isSparkSessionManaged,
spark -> transformActions(inputPaths, targetBaseDir, spark));
}
private static void transformActions(
String inputPaths, String targetBaseDir, SparkSession spark) throws IOException {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
LinkedList<String> pathQ =
Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
final String rawset = pathQ.pollLast();
final String actionSetDirectory = pathQ.pollLast();
final Path targetDirectory =
new Path(targetBaseDir + SEPARATOR + actionSetDirectory + SEPARATOR + rawset);
if (fs.exists(targetDirectory)) {
log.info("found target directory '{}", targetDirectory);
fs.delete(targetDirectory, true);
log.info("deleted target directory '{}", targetDirectory);
}
log.info("transforming actions from '{}' to '{}'", sourcePath, targetDirectory);
sc.sequenceFile(sourcePath, Text.class, Text.class)
.map(
a ->
eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(
a._2().toString()))
.map(TransformActions::doTransform)
.filter(Objects::nonNull)
.mapToPair(
a ->
new Tuple2<>(
a.getClazz().toString(),
OBJECT_MAPPER.writeValueAsString(a)))
.mapToPair(t -> new Tuple2(new Text(t._1()), new Text(t._2())))
.saveAsNewAPIHadoopFile(
targetDirectory.toString(),
Text.class,
Text.class,
SequenceFileOutputFormat.class,
sc.hadoopConfiguration());
}
}
private static AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa)
throws InvalidProtocolBufferException {
// dedup similarity relations had empty target value, don't migrate them
if (aa.getTargetValue().length == 0) {
return null;
}
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
final Oaf oaf = ProtoConverter.convert(proto_oaf);
switch (proto_oaf.getKind()) {
case entity:
switch (proto_oaf.getEntity().getType()) {
case datasource:
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
case organization:
return new AtomicAction<>(Organization.class, (Organization) oaf);
case project:
return new AtomicAction<>(Project.class, (Project) oaf);
case result:
final String resulttypeid =
proto_oaf
.getEntity()
.getResult()
.getMetadata()
.getResulttype()
.getClassid();
switch (resulttypeid) {
case "publication":
return new AtomicAction<>(Publication.class, (Publication) oaf);
case "software":
return new AtomicAction<>(Software.class, (Software) oaf);
case "other":
return new AtomicAction<>(
OtherResearchProduct.class, (OtherResearchProduct) oaf);
case "dataset":
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
default:
// can be an update, where the resulttype is not specified
return new AtomicAction<>(Result.class, (Result) oaf);
}
default:
throw new IllegalArgumentException(
"invalid entity type: " + proto_oaf.getEntity().getType());
}
case relation:
return new AtomicAction<>(Relation.class, (Relation) oaf);
default:
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
}
}
private static String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
String XQUERY =
"collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
return isLookUp.getResourceProfileByQuery(XQUERY);
}
}

View File

@ -0,0 +1,56 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "is",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "sn",
"paramLongName": "sourceNameNode",
"paramDescription": "nameNode of the source cluster",
"paramRequired": true
},
{
"paramName": "tn",
"paramLongName": "targetNameNode",
"paramDescription": "namoNode of the target cluster",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workingDirectory",
"paramDescription": "working directory",
"paramRequired": true
},
{
"paramName": "nm",
"paramLongName": "distcp_num_maps",
"paramDescription": "maximum number of map tasks used in the distcp process",
"paramRequired": true
},
{
"paramName": "mm",
"paramLongName": "distcp_memory_mb",
"paramDescription": "memory for distcp action copying actionsets from remote cluster",
"paramRequired": true
},
{
"paramName": "tt",
"paramLongName": "distcp_task_timeout",
"paramDescription": "timeout for distcp copying actions from remote cluster",
"paramRequired": true
},
{
"paramName": "tr",
"paramLongName": "transform_only",
"paramDescription": "activate tranform-only mode. Only apply transformation step",
"paramRequired": true
}
]

View File

@ -0,0 +1,20 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "is",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "inputPaths",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
}
]

View File

@ -10,7 +10,6 @@
</property>
<property>
<name>workingDirectory</name>
<value>/tmp/actionsets</value>
<description>working directory</description>
</property>
<property>
@ -44,6 +43,20 @@
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
@ -66,23 +79,27 @@
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
<start to='migrate_actionsets' />
<start to="migrate_actionsets"/>
<action name='migrate_actionsets'>
<action name="migrate_actionsets">
<java>
<main-class>eu.dnetlib.dhp.migration.actions.MigrateActionSet</main-class>
<main-class>eu.dnetlib.dhp.actionmanager.migration.MigrateActionSet</main-class>
<java-opt>-Dmapred.task.timeout=${distcp_task_timeout}</java-opt>
<arg>-is</arg><arg>${isLookupUrl}</arg>
<arg>-sn</arg><arg>${sourceNN}</arg>
<arg>-tn</arg><arg>${nameNode}</arg>
<arg>-w</arg><arg>${workingDirectory}</arg>
<arg>-nm</arg><arg>${distcp_num_maps}</arg>
<arg>-mm</arg><arg>${distcp_memory_mb}</arg>
<arg>-tt</arg><arg>${distcp_task_timeout}</arg>
<arg>-tr</arg><arg>${transform_only}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--sourceNameNode</arg><arg>${sourceNN}</arg>
<arg>--targetNameNode</arg><arg>${nameNode}</arg>
<arg>--workingDirectory</arg><arg>${workingDirectory}</arg>
<arg>--distcp_num_maps</arg><arg>${distcp_num_maps}</arg>
<arg>--distcp_memory_mb</arg><arg>${distcp_memory_mb}</arg>
<arg>--distcp_task_timeout</arg><arg>${distcp_task_timeout}</arg>
<arg>--transform_only</arg><arg>${transform_only}</arg>
<capture-output/>
</java>
<ok to="transform_actions" />
@ -94,19 +111,18 @@
<master>yarn</master>
<mode>cluster</mode>
<name>transform_actions</name>
<class>eu.dnetlib.dhp.migration.actions.TransformActions</class>
<jar>dhp-aggregation-${projectVersion}.jar</jar>
<class>eu.dnetlib.dhp.actionmanager.migration.TransformActions</class>
<jar>dhp-actionmanager-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores ${sparkExecutorCores}
--executor-memory ${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>-mt</arg><arg>yarn</arg>
<arg>-is</arg><arg>${isLookupUrl}</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--inputPaths</arg><arg>${wf:actionData('migrate_actionsets')['target_paths']}</arg>
</spark>
<ok to="end"/>

View File

@ -1,10 +1,10 @@
{"collectedFrom":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::d0bbea1f5bed5864d1904eb602e608a6"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|OpenstarTs__::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::fc7459b8fed8c0d47947fe04275251c0"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|NARCIS__cris::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::c978e29d3b2ddf4f0c2b6e60d6613426"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|MetisRadboud::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::b58bdbe8ae5acead04fc76777d2f8017"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":true,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|MetisRadboud::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|dedup_wf_001::8de0f5a712997aafe0d794a53e51b75a"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|UnityFVG____::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::89bab7c5a227fc27b2b9cadf475a6b71"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::007a4870b31056f89b768cf508e1538e"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|VTTRsInSsCrs::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::735915884eb439d42953372eaf934782"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":true,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|dedup_wf_001::9ea9c0996c87e1dc7fc69f94b5ed0010"}
{"collectedFrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","subRelType":"provision","target":"20|openaire____::c24a458004a31f9687089ea3d249de51"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::d0bbea1f5bed5864d1904eb602e608a6"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|OpenstarTs__::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::fc7459b8fed8c0d47947fe04275251c0"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|NARCIS__cris::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::c978e29d3b2ddf4f0c2b6e60d6613426"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":true,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|MetisRadboud::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::b58bdbe8ae5acead04fc76777d2f8017"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":true,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|MetisRadboud::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|dedup_wf_001::8de0f5a712997aafe0d794a53e51b75a"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|UnityFVG____::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::89bab7c5a227fc27b2b9cadf475a6b71"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|4ScienceCRIS::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::007a4870b31056f89b768cf508e1538e"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|VTTRsInSsCrs::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|openaire____::735915884eb439d42953372eaf934782"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":true,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|CRIS_UNS____::f66f1bd369679b5b077dcdf006089556","subRelType":"provision","target":"20|dedup_wf_001::9ea9c0996c87e1dc7fc69f94b5ed0010"}
{"collectedfrom":[],"dataInfo":{"deletedbyinference":false,"inferenceprovenance":"","inferred":false,"invisible":false,"provenanceaction":{"classid":"UNKNOWN","classname":"UNKNOWN","schemeid":"dnet:provenanceActions","schemename":"dnet:provenanceActions"},"trust":"0.9"},"lastupdatetimestamp":0,"relClass":"provides","relType":"datasourceOrganization","source":"10|CSC_________::a2b9ce8435390bcbfc05f3cae3948747","subRelType":"provision","target":"20|openaire____::c24a458004a31f9687089ea3d249de51"}

View File

@ -1,213 +0,0 @@
package eu.dnetlib.dhp.migration.actions;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.protobuf.InvalidProtocolBufferException;
import eu.dnetlib.data.proto.OafProtos;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.action.AtomicAction;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import java.io.IOException;
import java.io.Serializable;
import java.util.LinkedList;
import java.util.Objects;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.SparkSession;
public class TransformActions implements Serializable {
private static final Log log = LogFactory.getLog(TransformActions.class);
private static final String SEPARATOR = "/";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser =
new ArgumentApplicationParser(
IOUtils.toString(
MigrateActionSet.class.getResourceAsStream(
"/eu/dnetlib/dhp/migration/transform_actionsets_parameters.json")));
parser.parseArgument(args);
new TransformActions().run(parser);
}
private void run(ArgumentApplicationParser parser) throws ISLookUpException, IOException {
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: " + isLookupUrl);
final String inputPaths = parser.get("inputPaths");
if (StringUtils.isBlank(inputPaths)) {
throw new RuntimeException("empty inputPaths");
}
log.info("inputPaths: " + inputPaths);
final String targetBaseDir = getTargetBaseDir(isLookupUrl);
try (SparkSession spark = getSparkSession(parser)) {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
for (String sourcePath : Lists.newArrayList(Splitter.on(",").split(inputPaths))) {
LinkedList<String> pathQ =
Lists.newLinkedList(Splitter.on(SEPARATOR).split(sourcePath));
final String rawset = pathQ.pollLast();
final String actionSetDirectory = pathQ.pollLast();
final Path targetDirectory =
new Path(
targetBaseDir
+ SEPARATOR
+ actionSetDirectory
+ SEPARATOR
+ rawset);
if (fs.exists(targetDirectory)) {
log.info(String.format("found target directory '%s", targetDirectory));
fs.delete(targetDirectory, true);
log.info(String.format("deleted target directory '%s", targetDirectory));
}
log.info(
String.format(
"transforming actions from '%s' to '%s'",
sourcePath, targetDirectory));
sc.sequenceFile(sourcePath, Text.class, Text.class)
.map(
a ->
eu.dnetlib.actionmanager.actions.AtomicAction.fromJSON(
a._2().toString()))
.map(a -> doTransform(a))
.filter(Objects::isNull)
.filter(a -> a.getPayload() == null)
.map(a -> new ObjectMapper().writeValueAsString(a))
.saveAsTextFile(targetDirectory.toString(), GzipCodec.class);
}
}
}
private Text transformAction(eu.dnetlib.actionmanager.actions.AtomicAction aa)
throws InvalidProtocolBufferException, JsonProcessingException {
final Text out = new Text();
final ObjectMapper mapper = new ObjectMapper();
if (aa.getTargetValue() != null && aa.getTargetValue().length > 0) {
out.set(mapper.writeValueAsString(doTransform(aa)));
}
return out;
}
private AtomicAction<Relation> getRelationAtomicAction(String atomicaActionId) {
final String[] splitId = atomicaActionId.split("@");
String source = splitId[0];
String target = splitId[2];
String[] relSemantic = splitId[1].split("_");
Relation rel = new Relation();
rel.setSource(source);
rel.setTarget(target);
rel.setRelType(relSemantic[0]);
rel.setSubRelType(relSemantic[1]);
rel.setRelClass(relSemantic[2]);
DataInfo d = new DataInfo();
d.setDeletedbyinference(false);
d.setInferenceprovenance("deduplication");
d.setInferred(true);
d.setInvisible(false);
Qualifier provenanceaction = new Qualifier();
provenanceaction.setClassid("deduplication");
provenanceaction.setClassname("deduplication");
provenanceaction.setSchemeid("dnet:provenanceActions");
provenanceaction.setSchemename("dnet:provenanceActions");
d.setProvenanceaction(provenanceaction);
rel.setDataInfo(d);
return new AtomicAction<>(Relation.class, rel);
}
private AtomicAction doTransform(eu.dnetlib.actionmanager.actions.AtomicAction aa)
throws InvalidProtocolBufferException {
final OafProtos.Oaf proto_oaf = OafProtos.Oaf.parseFrom(aa.getTargetValue());
final Oaf oaf = ProtoConverter.convert(proto_oaf);
switch (proto_oaf.getKind()) {
case entity:
switch (proto_oaf.getEntity().getType()) {
case datasource:
return new AtomicAction<>(Datasource.class, (Datasource) oaf);
case organization:
return new AtomicAction<>(Organization.class, (Organization) oaf);
case project:
return new AtomicAction<>(Project.class, (Project) oaf);
case result:
final String resulttypeid =
proto_oaf
.getEntity()
.getResult()
.getMetadata()
.getResulttype()
.getClassid();
switch (resulttypeid) {
case "publication":
return new AtomicAction<>(Publication.class, (Publication) oaf);
case "software":
return new AtomicAction<>(Software.class, (Software) oaf);
case "other":
return new AtomicAction<>(
OtherResearchProduct.class, (OtherResearchProduct) oaf);
case "dataset":
return new AtomicAction<>(Dataset.class, (Dataset) oaf);
default:
// can be an update, where the resulttype is not specified
return new AtomicAction<>(Result.class, (Result) oaf);
}
default:
throw new IllegalArgumentException(
"invalid entity type: " + proto_oaf.getEntity().getType());
}
case relation:
return new AtomicAction<>(Relation.class, (Relation) oaf);
default:
throw new IllegalArgumentException("invalid kind: " + proto_oaf.getKind());
}
}
private String getTargetBaseDir(String isLookupUrl) throws ISLookUpException {
ISLookUpService isLookUp = ISLookupClientFactory.getLookUpService(isLookupUrl);
String XQUERY =
"collection('/db/DRIVER/ServiceResources/ActionManagerServiceResourceType')//SERVICE_PROPERTIES/PROPERTY[@key = 'basePath']/@value/string()";
return isLookUp.getResourceProfileByQuery(XQUERY);
}
private static SparkSession getSparkSession(ArgumentApplicationParser parser) {
SparkConf conf = new SparkConf();
return SparkSession.builder()
.appName(TransformActions.class.getSimpleName())
.master(parser.get("master"))
.config(conf)
.enableHiveSupport()
.getOrCreate();
}
}

View File

@ -1,10 +0,0 @@
[
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true},
{"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true},
{"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true},
{"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true},
{"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true},
{"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true},
{"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true}
]

View File

@ -1,5 +0,0 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"i", "paramLongName":"inputPaths", "paramDescription": "URL of the isLookUp Service", "paramRequired": true}
]

View File

@ -5,29 +5,29 @@ import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.pace.config.DedupConfig;
import java.util.Collection;
import java.util.Iterator;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
public class DedupRecordFactory {
private static final Logger log = LoggerFactory.getLogger(DedupRecordFactory.class);
protected static final ObjectMapper OBJECT_MAPPER =
new com.fasterxml.jackson.databind.ObjectMapper()
.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
public static <T extends OafEntity> Dataset<T> createDedupRecord(
final SparkSession spark,
final String mergeRelsInputPath,
final String entitiesInputPath,
final Class<T> clazz,
final DedupConfig dedupConf) {
final Class<T> clazz) {
long ts = System.currentTimeMillis();
@ -54,40 +54,39 @@ public class DedupRecordFactory {
r -> new Tuple2<>(r.getSource(), r.getTarget()),
Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
// <dedup_id, json_entity_merged>
return mergeRels
.joinWith(entities, mergeRels.col("_1").equalTo(entities.col("_1")), "left_outer")
.filter(
(FilterFunction<Tuple2<Tuple2<String, String>, Tuple2<String, T>>>)
value -> value._2() != null)
.joinWith(entities, mergeRels.col("_2").equalTo(entities.col("_1")), "inner")
.map(
(MapFunction<Tuple2<Tuple2<String, String>, Tuple2<String, T>>, T>)
value -> value._2()._2(),
Encoders.kryo(clazz))
.groupByKey((MapFunction<T, String>) value -> value.getId(), Encoders.STRING())
(MapFunction<
Tuple2<Tuple2<String, String>, Tuple2<String, T>>,
Tuple2<String, T>>)
value -> new Tuple2<>(value._1()._1(), value._2()._2()),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)))
.groupByKey(
(MapFunction<Tuple2<String, T>, String>) entity -> entity._1(),
Encoders.STRING())
.mapGroups(
(MapGroupsFunction<String, T, T>)
(MapGroupsFunction<String, Tuple2<String, T>, T>)
(key, values) -> entityMerger(key, values, ts, clazz),
Encoders.bean(clazz));
}
private static <T extends OafEntity> T entityMerger(
String id, Iterator<T> entities, final long ts, Class<T> clazz) {
String id, Iterator<Tuple2<String, T>> entities, long ts, Class<T> clazz) {
try {
T entity = clazz.newInstance();
entity.setId(id);
if (entity.getDataInfo() == null) {
entity.setDataInfo(new DataInfo());
}
entity.setDataInfo(new DataInfo());
entity.getDataInfo().setTrust("0.9");
entity.setLastupdatetimestamp(ts);
final Collection<String> dates = Lists.newArrayList();
entities.forEachRemaining(
e -> {
entity.mergeFrom(e);
if (ModelSupport.isSubClass(e, Result.class)) {
Result r1 = (Result) e;
t -> {
T duplicate = t._2();
entity.mergeFrom(duplicate);
if (ModelSupport.isSubClass(duplicate, Result.class)) {
Result r1 = (Result) duplicate;
Result er = (Result) entity;
er.setAuthor(DedupUtility.mergeAuthor(er.getAuthor(), r1.getAuthor()));

View File

@ -11,8 +11,6 @@ import eu.dnetlib.pace.config.DedupConfig;
import java.io.IOException;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.dom4j.DocumentException;
@ -71,13 +69,10 @@ public class SparkCreateDedupRecord extends AbstractSparkAction {
Class<OafEntity> clazz = ModelSupport.entityTypes.get(EntityType.valueOf(subEntity));
DedupRecordFactory.createDedupRecord(spark, mergeRelPath, entityPath, clazz, dedupConf)
.map(
(MapFunction<OafEntity, String>)
value -> OBJECT_MAPPER.writeValueAsString(value),
Encoders.STRING())
DedupRecordFactory.createDedupRecord(spark, mergeRelPath, entityPath, clazz)
.write()
.mode(SaveMode.Overwrite)
.option("compression", "gzip")
.json(outputPath);
}
}

View File

@ -33,7 +33,7 @@ public class SparkUpdateEntity extends AbstractSparkAction {
private static final Logger log = LoggerFactory.getLogger(SparkUpdateEntity.class);
final String IDJSONPATH = "$.id";
private static final String IDJSONPATH = "$.id";
public SparkUpdateEntity(ArgumentApplicationParser parser, SparkSession spark) {
super(parser, spark);
@ -65,27 +65,25 @@ public class SparkUpdateEntity extends AbstractSparkAction {
log.info("workingPath: '{}'", workingPath);
log.info("dedupGraphPath: '{}'", dedupGraphPath);
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
// for each entity
ModelSupport.entityTypes.forEach(
(entity, clazz) -> {
final String outputPath = dedupGraphPath + "/" + entity;
(type, clazz) -> {
final String outputPath = dedupGraphPath + "/" + type;
removeOutputDir(spark, outputPath);
JavaRDD<String> sourceEntity =
sc.textFile(
DedupUtility.createEntityPath(
graphBasePath, entity.toString()));
DedupUtility.createEntityPath(graphBasePath, type.toString()));
if (mergeRelExists(workingPath, entity.toString())) {
if (mergeRelExists(workingPath, type.toString())) {
final String mergeRelPath =
DedupUtility.createMergeRelPath(
workingPath, "*", entity.toString());
DedupUtility.createMergeRelPath(workingPath, "*", type.toString());
final String dedupRecordPath =
DedupUtility.createDedupRecordPath(
workingPath, "*", entity.toString());
workingPath, "*", type.toString());
final Dataset<Relation> rel =
spark.read().load(mergeRelPath).as(Encoders.bean(Relation.class));
@ -107,7 +105,6 @@ public class SparkUpdateEntity extends AbstractSparkAction {
MapDocumentUtil.getJPathString(
IDJSONPATH, s),
s));
JavaRDD<String> map =
entitiesWithId
.leftOuterJoin(mergedIds)

View File

@ -8,6 +8,7 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import eu.dnetlib.pace.util.MapDocumentUtil;
import java.io.File;
import java.io.IOException;
import java.io.Serializable;
@ -16,14 +17,20 @@ import java.nio.file.Paths;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.jupiter.MockitoExtension;
import scala.Tuple2;
@ExtendWith(MockitoExtension.class)
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
@ -41,7 +48,7 @@ public class SparkDedupTest implements Serializable {
private static final String testActionSetId = "test-orchestrator";
@BeforeAll
private static void cleanUp() throws IOException, URISyntaxException {
public static void cleanUp() throws IOException, URISyntaxException {
testGraphBasePath =
Paths.get(
@ -65,7 +72,7 @@ public class SparkDedupTest implements Serializable {
spark =
SparkSession.builder()
.appName(SparkCreateSimRels.class.getSimpleName())
.appName(SparkDedupTest.class.getSimpleName())
.master("local[*]")
.config(new SparkConf())
.getOrCreate();
@ -74,7 +81,7 @@ public class SparkDedupTest implements Serializable {
}
@BeforeEach
private void setUp() throws IOException, ISLookUpException {
public void setUp() throws IOException, ISLookUpException {
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains(testActionSetId)))
@ -96,6 +103,13 @@ public class SparkDedupTest implements Serializable {
IOUtils.toString(
SparkDedupTest.class.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/pub.curr.conf.json")));
lenient()
.when(isLookUpService.getResourceProfileByQuery(Mockito.contains("software")))
.thenReturn(
IOUtils.toString(
SparkDedupTest.class.getResourceAsStream(
"/eu/dnetlib/dhp/dedup/conf/sw.curr.conf.json")));
}
@Test
@ -109,7 +123,6 @@ public class SparkDedupTest implements Serializable {
"/eu/dnetlib/dhp/oa/dedup/createSimRels_parameters.json")));
parser.parseArgument(
new String[] {
"-mt", "local[*]",
"-i", testGraphBasePath,
"-asi", testActionSetId,
"-la", "lookupurl",
@ -126,9 +139,14 @@ public class SparkDedupTest implements Serializable {
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/publication_simrel")
.count();
long sw_simrel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/software_simrel")
.count();
assertEquals(3288, orgs_simrel);
assertEquals(3432, orgs_simrel);
assertEquals(7260, pubs_simrel);
assertEquals(344, sw_simrel);
}
@Test
@ -142,7 +160,6 @@ public class SparkDedupTest implements Serializable {
"/eu/dnetlib/dhp/oa/dedup/createCC_parameters.json")));
parser.parseArgument(
new String[] {
"-mt", "local[*]",
"-i", testGraphBasePath,
"-asi", testActionSetId,
"-la", "lookupurl",
@ -159,9 +176,14 @@ public class SparkDedupTest implements Serializable {
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/publication_mergerel")
.count();
long sw_mergerel =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
.count();
assertEquals(1244, orgs_mergerel);
assertEquals(1276, orgs_mergerel);
assertEquals(1460, pubs_mergerel);
assertEquals(288, sw_mergerel);
}
@Test
@ -175,7 +197,6 @@ public class SparkDedupTest implements Serializable {
"/eu/dnetlib/dhp/oa/dedup/createDedupRecord_parameters.json")));
parser.parseArgument(
new String[] {
"-mt", "local[*]",
"-i", testGraphBasePath,
"-asi", testActionSetId,
"-la", "lookupurl",
@ -198,9 +219,13 @@ public class SparkDedupTest implements Serializable {
+ testActionSetId
+ "/publication_deduprecord")
.count();
long sw_deduprecord =
jsc.textFile(testOutputBasePath + "/" + testActionSetId + "/software_deduprecord")
.count();
assertEquals(82, orgs_deduprecord);
assertEquals(66, pubs_deduprecord);
assertEquals(51, sw_deduprecord);
}
@Test
@ -214,7 +239,6 @@ public class SparkDedupTest implements Serializable {
"/eu/dnetlib/dhp/oa/dedup/updateEntity_parameters.json")));
parser.parseArgument(
new String[] {
"-mt", "local[*]",
"-i", testGraphBasePath,
"-w", testOutputBasePath,
"-o", testDedupGraphBasePath
@ -224,6 +248,9 @@ public class SparkDedupTest implements Serializable {
long organizations = jsc.textFile(testDedupGraphBasePath + "/organization").count();
long publications = jsc.textFile(testDedupGraphBasePath + "/publication").count();
long projects = jsc.textFile(testDedupGraphBasePath + "/project").count();
long datasource = jsc.textFile(testDedupGraphBasePath + "/datasource").count();
long softwares = jsc.textFile(testDedupGraphBasePath + "/software").count();
long mergedOrgs =
spark.read()
@ -245,20 +272,40 @@ public class SparkDedupTest implements Serializable {
.distinct()
.count();
long mergedSw =
spark.read()
.load(testOutputBasePath + "/" + testActionSetId + "/software_mergerel")
.as(Encoders.bean(Relation.class))
.where("relClass=='merges'")
.javaRDD()
.map(Relation::getTarget)
.distinct()
.count();
assertEquals(897, publications);
assertEquals(835, organizations);
assertEquals(100, projects);
assertEquals(100, datasource);
assertEquals(200, softwares);
long deletedOrgs =
jsc.textFile(testDedupGraphBasePath + "/organization")
.filter(this::isDeletedByInference)
.count();
long deletedPubs =
jsc.textFile(testDedupGraphBasePath + "/publication")
.filter(this::isDeletedByInference)
.count();
long deletedSw =
jsc.textFile(testDedupGraphBasePath + "/software")
.filter(this::isDeletedByInference)
.count();
assertEquals(mergedOrgs, deletedOrgs);
assertEquals(mergedPubs, deletedPubs);
assertEquals(mergedSw, deletedSw);
}
@Test
@ -272,7 +319,6 @@ public class SparkDedupTest implements Serializable {
"/eu/dnetlib/dhp/oa/dedup/propagateRelation_parameters.json")));
parser.parseArgument(
new String[] {
"-mt", "local[*]",
"-i", testGraphBasePath,
"-w", testOutputBasePath,
"-o", testDedupGraphBasePath
@ -283,6 +329,43 @@ public class SparkDedupTest implements Serializable {
long relations = jsc.textFile(testDedupGraphBasePath + "/relation").count();
assertEquals(826, relations);
// check deletedbyinference
final Dataset<Relation> mergeRels =
spark.read()
.load(DedupUtility.createMergeRelPath(testOutputBasePath, "*", "*"))
.as(Encoders.bean(Relation.class));
final JavaPairRDD<String, String> mergedIds =
mergeRels
.where("relClass == 'merges'")
.select(mergeRels.col("target"))
.distinct()
.toJavaRDD()
.mapToPair(
(PairFunction<Row, String, String>)
r -> new Tuple2<String, String>(r.getString(0), "d"));
JavaRDD<String> toCheck =
jsc.textFile(testDedupGraphBasePath + "/relation")
.mapToPair(
json ->
new Tuple2<>(
MapDocumentUtil.getJPathString("$.source", json),
json))
.join(mergedIds)
.map(t -> t._2()._1())
.mapToPair(
json ->
new Tuple2<>(
MapDocumentUtil.getJPathString("$.target", json),
json))
.join(mergedIds)
.map(t -> t._2()._1());
long deletedbyinference = toCheck.filter(this::isDeletedByInference).count();
long updated = toCheck.count();
assertEquals(updated, deletedbyinference);
}
@AfterAll

View File

@ -0,0 +1,118 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "dataset",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : ["result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree" : {
"start" : {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2" : {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3" : {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true"
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.pid[@.qualifier.classid = 'doi'].value"
},
{
"name" : "pid",
"type" : "JSON",
"path" : "$.pid",
"overrideMatch" : "true"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[@.qualifier.classid = 'main title'].value",
"length" : 250,
"size" : 5
},
{
"name" : "authors",
"type" : "String",
"path" : "$.author[*].fullname",
"size" : 200
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
}
],
"blacklists" : {},
"synonyms" : {}
}
}

View File

@ -0,0 +1,118 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "otherresearchproduct",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi" ], "params" : { } }
],
"decisionTree" : {
"start" : {
"fields": [
{
"field": "pid",
"comparator": "jsonListMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {
"jpath_value": "$.value",
"jpath_classid": "$.qualifier.classid"
}
}
],
"threshold": 0.5,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "true"
},
"layer2" : {
"fields": [
{
"field": "title",
"comparator": "titleVersionMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
},
{
"field": "authors",
"comparator": "sizeMatch",
"weight": 1.0,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1.0,
"aggregation": "AND",
"positive": "layer3",
"negative": "NO_MATCH",
"undefined": "layer3",
"ignoreUndefined": "false"
},
"layer3" : {
"fields": [
{
"field": "title",
"comparator": "levensteinTitle",
"weight": 1.0,
"countIfUndefined": "true"
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "true"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.pid[@.qualifier.classid = 'doi'}].value"
},
{
"name" : "pid",
"type" : "JSON",
"path" : "$.pid",
"overrideMatch" : "true"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[@.qualifier.classid = 'main title'].value",
"length" : 250,
"size" : 5
},
{
"name" : "authors",
"type" : "String",
"path" : "$.author[*].fullname",
"size" : 200
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
}
],
"blacklists" : {},
"synonyms" : {}
}
}

View File

@ -0,0 +1,92 @@
{
"wf" : {
"threshold" : "0.99",
"dedupRun" : "001",
"entityType" : "result",
"subEntityType" : "resulttype",
"subEntityValue" : "software",
"orderField" : "title",
"queueMaxSize" : "2000",
"groupMaxSize" : "100",
"maxChildren" : "100",
"slidingWindowSize" : "200",
"rootBuilder" : [ "result", "resultProject_outcome_isProducedBy", "resultResult_publicationDataset_isRelatedTo", "resultResult_similarity_isAmongTopNSimilarDocuments", "resultResult_similarity_hasAmongTopNSimilarDocuments", "resultOrganization_affiliation_hasAuthorInstitution", "resultResult_part_hasPart", "resultResult_part_isPartOf", "resultResult_supplement_isSupplementTo", "resultResult_supplement_isSupplementedBy", "resultResult_version_isVersionOf" ],
"includeChildren" : "true"
},
"pace" : {
"clustering" : [
{ "name" : "ngrampairs", "fields" : [ "title" ], "params" : { "max" : "1", "ngramLen" : "3"} },
{ "name" : "suffixprefix", "fields" : [ "title" ], "params" : { "max" : "1", "len" : "3" } },
{ "name" : "lowercase", "fields" : [ "doi", "url" ], "params" : { } }
],
"decisionTree": {
"start": {
"fields": [
{
"field": "doi",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
},
{
"field": "url",
"comparator": "exactMatch",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 1,
"aggregation": "OR",
"positive": "MATCH",
"negative": "layer2",
"undefined": "layer2",
"ignoreUndefined": "false"
},
"layer2": {
"fields": [
{
"field": "title",
"comparator": "levensteinTitleIgnoreVersion",
"weight": 1,
"countIfUndefined": "false",
"params": {}
}
],
"threshold": 0.99,
"aggregation": "AVG",
"positive": "MATCH",
"negative": "NO_MATCH",
"undefined": "NO_MATCH",
"ignoreUndefined": "false"
}
},
"model" : [
{
"name" : "doi",
"type" : "String",
"path" : "$.pid[?(@.qualifier.classid == 'doi')].value"
},
{
"name" : "title",
"type" : "String",
"path" : "$.title[?(@.qualifier.classid == 'main title')].value",
"length" : 250,
"size" : 5
},
{
"name" : "url",
"type" : "String",
"path" : "$.instance.url"
},
{
"name" : "resulttype",
"type" : "String",
"path" : "$.resulttype.classid"
}
],
"blacklists" : {},
"synonyms": {}
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -14,6 +14,7 @@
<SCAN_SEQUENCE>
<SCAN id="organization"/>
<SCAN id="publication"/>
<SCAN id="software"/>
</SCAN_SEQUENCE>
</DEDUPLICATION>
</CONFIGURATION>

View File

@ -1,9 +1,38 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.keyValue;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.oaiIProvenance;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import eu.dnetlib.dhp.schema.oaf.*;
import java.util.*;
import eu.dnetlib.dhp.schema.oaf.Author;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.dom4j.Document;
import org.dom4j.DocumentFactory;
@ -29,6 +58,12 @@ public abstract class AbstractMdRecordToOafMapper {
qualifier("software", "software", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier OTHER_RESULTTYPE_QUALIFIER =
qualifier("other", "other", "dnet:result_typologies", "dnet:result_typologies");
protected static final Qualifier REPOSITORY_QUALIFIER =
qualifier(
"sysimport:crosswalk:repository",
"sysimport:crosswalk:repository",
"dnet:provenanceActions",
"dnet:provenanceActions");
protected AbstractMdRecordToOafMapper(final Map<String, String> code2name) {
this.code2name = code2name;
@ -55,13 +90,13 @@ public abstract class AbstractMdRecordToOafMapper {
final String type = doc.valueOf("//dr:CobjCategory/@type");
final KeyValue collectedFrom =
keyValue(
doc.valueOf("//oaf:collectedFrom/@id"),
createOpenaireId(10, doc.valueOf("//oaf:collectedFrom/@id"), true),
doc.valueOf("//oaf:collectedFrom/@name"));
final KeyValue hostedBy =
StringUtils.isBlank(doc.valueOf("//oaf:hostedBy/@id"))
? collectedFrom
: keyValue(
doc.valueOf("//oaf:hostedBy/@id"),
createOpenaireId(10, doc.valueOf("//oaf:hostedBy/@id"), true),
doc.valueOf("//oaf:hostedBy/@name"));
final DataInfo info = prepareDataInfo(doc);
@ -154,7 +189,7 @@ public abstract class AbstractMdRecordToOafMapper {
r1.setRelClass("isProducedBy");
r1.setSource(docId);
r1.setTarget(projectId);
r1.setCollectedFrom(Arrays.asList(collectedFrom));
r1.setCollectedfrom(Arrays.asList(collectedFrom));
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r1);
@ -165,7 +200,7 @@ public abstract class AbstractMdRecordToOafMapper {
r2.setRelClass("produces");
r2.setSource(projectId);
r2.setTarget(docId);
r2.setCollectedFrom(Arrays.asList(collectedFrom));
r2.setCollectedfrom(Arrays.asList(collectedFrom));
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r2);
@ -398,7 +433,7 @@ public abstract class AbstractMdRecordToOafMapper {
final Node n = doc.selectSingleNode("//oaf:datainfo");
if (n == null) {
return null;
return dataInfo(false, null, false, false, REPOSITORY_QUALIFIER, "0.9");
}
final String paClassId = n.valueOf("./oaf:provenanceaction/@classid");

View File

@ -1,11 +1,35 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.asString;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.dataInfo;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.journal;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listFields;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.listKeyValues;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.qualifier;
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.structuredProperty;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
import eu.dnetlib.dhp.oa.graph.raw.common.DbClient;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Context;
import eu.dnetlib.dhp.schema.oaf.DataInfo;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.Software;
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
import java.io.Closeable;
import java.io.IOException;
import java.sql.Array;
@ -119,7 +143,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
ds.setOriginalId(Arrays.asList(rs.getString("datasourceid")));
ds.setCollectedfrom(
listKeyValues(
rs.getString("collectedfromid"), rs.getString("collectedfromname")));
createOpenaireId(10, rs.getString("collectedfromid"), true),
rs.getString("collectedfromname")));
ds.setPid(new ArrayList<>());
ds.setDateofcollection(asString(rs.getDate("dateofcollection")));
ds.setDateoftransformation(null); // Value not returned by the SQL query
@ -185,7 +210,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
p.setOriginalId(Arrays.asList(rs.getString("projectid")));
p.setCollectedfrom(
listKeyValues(
rs.getString("collectedfromid"), rs.getString("collectedfromname")));
createOpenaireId(10, rs.getString("collectedfromid"), true),
rs.getString("collectedfromname")));
p.setPid(new ArrayList<>());
p.setDateofcollection(asString(rs.getDate("dateofcollection")));
p.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
@ -240,7 +266,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
o.setOriginalId(Arrays.asList(rs.getString("organizationid")));
o.setCollectedfrom(
listKeyValues(
rs.getString("collectedfromid"), rs.getString("collectedfromname")));
createOpenaireId(10, rs.getString("collectedfromid"), true),
rs.getString("collectedfromname")));
o.setPid(new ArrayList<>());
o.setDateofcollection(asString(rs.getDate("dateofcollection")));
o.setDateoftransformation(asString(rs.getDate("dateoftransformation")));
@ -285,7 +312,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
final String dsId = createOpenaireId(10, rs.getString("datasource"), true);
final List<KeyValue> collectedFrom =
listKeyValues(
rs.getString("collectedfromid"), rs.getString("collectedfromname"));
createOpenaireId(10, rs.getString("collectedfromid"), true),
rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType("datasourceOrganization");
@ -293,7 +321,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
r1.setRelClass("isProvidedBy");
r1.setSource(dsId);
r1.setTarget(orgId);
r1.setCollectedFrom(collectedFrom);
r1.setCollectedfrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
@ -303,7 +331,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
r2.setRelClass("provides");
r2.setSource(orgId);
r2.setTarget(dsId);
r2.setCollectedFrom(collectedFrom);
r2.setCollectedfrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
@ -320,7 +348,8 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
final String projectId = createOpenaireId(40, rs.getString("project"), true);
final List<KeyValue> collectedFrom =
listKeyValues(
rs.getString("collectedfromid"), rs.getString("collectedfromname"));
createOpenaireId(10, rs.getString("collectedfromid"), true),
rs.getString("collectedfromname"));
final Relation r1 = new Relation();
r1.setRelType("projectOrganization");
@ -328,7 +357,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
r1.setRelClass("isParticipant");
r1.setSource(projectId);
r1.setTarget(orgId);
r1.setCollectedFrom(collectedFrom);
r1.setCollectedfrom(collectedFrom);
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
@ -338,7 +367,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
r2.setRelClass("hasParticipant");
r2.setSource(orgId);
r2.setTarget(projectId);
r2.setCollectedFrom(collectedFrom);
r2.setCollectedfrom(collectedFrom);
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
@ -363,6 +392,9 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
"dnet:provenanceActions"),
"0.9");
final List<KeyValue> collectedFrom =
listKeyValues(createOpenaireId(10, "infrastruct_::openaire", true), "OpenAIRE");
try {
if (rs.getString("source_type").equals("context")) {
@ -381,6 +413,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
r.setLastupdatetimestamp(lastUpdateTimestamp);
r.setContext(prepareContext(rs.getString("source_id"), info));
r.setDataInfo(info);
r.setCollectedfrom(collectedFrom);
return Arrays.asList(r);
} else {
@ -395,18 +428,22 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication
final Relation r2 = new Relation();
if (rs.getString("source_type").equals("project")) {
r1.setCollectedfrom(collectedFrom);
r1.setRelType("resultProject");
r1.setSubRelType("outcome");
r1.setRelClass("produces");
r2.setCollectedfrom(collectedFrom);
r2.setRelType("resultProject");
r2.setSubRelType("outcome");
r2.setRelClass("isProducedBy");
} else {
r1.setCollectedfrom(collectedFrom);
r1.setRelType("resultResult");
r1.setSubRelType("relationship");
r1.setRelClass("isRelatedTo");
r2.setCollectedfrom(collectedFrom);
r2.setRelType("resultResult");
r2.setSubRelType("relationship");
r2.setRelClass("isRelatedTo");

View File

@ -232,7 +232,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
r1.setRelClass("isRelatedTo");
r1.setSource(docId);
r1.setTarget(otherId);
r1.setCollectedFrom(Arrays.asList(collectedFrom));
r1.setCollectedfrom(Arrays.asList(collectedFrom));
r1.setDataInfo(info);
r1.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r1);
@ -243,7 +243,7 @@ public class OafToOafMapper extends AbstractMdRecordToOafMapper {
r2.setRelClass("isRelatedTo");
r2.setSource(otherId);
r2.setTarget(docId);
r2.setCollectedFrom(Arrays.asList(collectedFrom));
r2.setCollectedfrom(Arrays.asList(collectedFrom));
r2.setDataInfo(info);
r2.setLastupdatetimestamp(lastUpdateTimestamp);
res.add(r2);

View File

@ -334,7 +334,7 @@ public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
r.setRelClass(relClass);
r.setSource(source);
r.setTarget(target);
r.setCollectedFrom(Arrays.asList(collectedFrom));
r.setCollectedfrom(Arrays.asList(collectedFrom));
r.setDataInfo(info);
r.setLastupdatetimestamp(lastUpdateTimestamp);
return r;

View File

@ -186,7 +186,7 @@ public abstract class AbstractScholexplorerParser {
r.setTarget(targetId);
r.setRelType(relationSemantic);
r.setRelClass("datacite");
r.setCollectedFrom(parsedObject.getCollectedfrom());
r.setCollectedfrom(parsedObject.getCollectedfrom());
r.setDataInfo(di);
rels.add(r);
r = new DLIRelation();
@ -195,7 +195,7 @@ public abstract class AbstractScholexplorerParser {
r.setTarget(parsedObject.getId());
r.setRelType(inverseRelation);
r.setRelClass("datacite");
r.setCollectedFrom(parsedObject.getCollectedfrom());
r.setCollectedfrom(parsedObject.getCollectedfrom());
r.setDateOfCollection(dateOfCollection);
rels.add(r);
if ("unknown".equalsIgnoreCase(relatedType))

View File

@ -21,6 +21,28 @@
<name>sparkExecutorCores</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
</property>
<property>
<name>spark2EventLogDir</name>
<description>spark 2.* event log dir location</description>
</property>
</parameters>
<global>
@ -35,6 +57,10 @@
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
</property>
<property>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
@ -52,14 +78,15 @@
<class>eu.dnetlib.dhp.oa.graph.GraphHiveImporterJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory ${sparkExecutorMemory}
--executor-cores ${sparkExecutorCores}
--executor-memory=${sparkExecutorMemory}
--executor-cores=${sparkExecutorCores}
--driver-memory=${sparkDriverMemory}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir}
</spark-opts>
<arg>-mt</arg> <arg>yarn-cluster</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--hive_db_name</arg><arg>${hive_db_name}</arg>
<arg>--hive_metastore_uris</arg><arg>${hive_metastore_uris}</arg>

View File

@ -1,11 +1,16 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.when;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Dataset;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Publication;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Software;
import java.io.IOException;
import java.util.List;
import java.util.Map;
@ -43,6 +48,7 @@ public class MappersTest {
final Relation r2 = (Relation) list.get(2);
assertValidId(p.getId());
assertValidId(p.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(p.getTitle().get(0).getValue()));
assertTrue(p.getAuthor().size() > 0);
assertTrue(p.getSubject().size() > 0);
@ -50,13 +56,24 @@ public class MappersTest {
assertTrue(StringUtils.isNotBlank(p.getJournal().getName()));
assertValidId(r1.getSource());
assertValidId(r1.getTarget());
assertValidId(r2.getSource());
assertValidId(r2.getTarget());
assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey());
assertNotNull(r1.getDataInfo());
assertNotNull(r2.getDataInfo());
assertNotNull(r1.getDataInfo().getTrust());
assertNotNull(r2.getDataInfo().getTrust());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
// System.out.println(new ObjectMapper().writeValueAsString(r1));
// System.out.println(new ObjectMapper().writeValueAsString(r2));
}
@Test
@ -65,15 +82,35 @@ public class MappersTest {
final List<Oaf> list = new OdfToOafMapper(code2name).processMdRecord(xml);
assertEquals(1, list.size());
assertEquals(3, list.size());
assertTrue(list.get(0) instanceof Dataset);
assertTrue(list.get(1) instanceof Relation);
assertTrue(list.get(2) instanceof Relation);
final Dataset d = (Dataset) list.get(0);
final Relation r1 = (Relation) list.get(1);
final Relation r2 = (Relation) list.get(2);
assertValidId(d.getId());
assertValidId(d.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(d.getTitle().get(0).getValue()));
assertTrue(d.getAuthor().size() > 0);
assertTrue(d.getSubject().size() > 0);
assertValidId(r1.getSource());
assertValidId(r1.getTarget());
assertValidId(r2.getSource());
assertValidId(r2.getTarget());
assertNotNull(r1.getDataInfo());
assertNotNull(r2.getDataInfo());
assertNotNull(r1.getDataInfo().getTrust());
assertNotNull(r2.getDataInfo().getTrust());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
}
@Test
@ -88,6 +125,7 @@ public class MappersTest {
final Software s = (Software) list.get(0);
assertValidId(s.getId());
assertValidId(s.getCollectedfrom().get(0).getKey());
assertTrue(StringUtils.isNotBlank(s.getTitle().get(0).getValue()));
assertTrue(s.getAuthor().size() > 0);
assertTrue(s.getSubject().size() > 0);

View File

@ -1,10 +1,17 @@
package eu.dnetlib.dhp.oa.graph.raw;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import com.fasterxml.jackson.core.type.TypeReference;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.schema.oaf.*;
import eu.dnetlib.dhp.schema.oaf.Datasource;
import eu.dnetlib.dhp.schema.oaf.Oaf;
import eu.dnetlib.dhp.schema.oaf.Organization;
import eu.dnetlib.dhp.schema.oaf.Project;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import java.io.IOException;
import java.sql.Array;
import java.sql.Date;
@ -13,6 +20,7 @@ import java.sql.SQLException;
import java.util.List;
import java.util.Objects;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.extension.ExtendWith;
@ -42,14 +50,13 @@ public class MigrateDbEntitiesApplicationTest {
final Datasource ds = (Datasource) list.get(0);
assertValidId(ds.getId());
assertValidId(ds.getCollectedfrom().get(0).getKey());
assertEquals(ds.getOfficialname().getValue(), getValueAsString("officialname", fields));
assertEquals(ds.getEnglishname().getValue(), getValueAsString("englishname", fields));
assertEquals(ds.getContactemail().getValue(), getValueAsString("contactemail", fields));
assertEquals(ds.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
assertEquals(
ds.getNamespaceprefix().getValue(), getValueAsString("namespaceprefix", fields));
assertEquals(
ds.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(
ds.getCollectedfrom().get(0).getValue(),
getValueAsString("collectedfromname", fields));
@ -65,10 +72,9 @@ public class MigrateDbEntitiesApplicationTest {
final Project p = (Project) list.get(0);
assertValidId(p.getId());
assertValidId(p.getCollectedfrom().get(0).getKey());
assertEquals(p.getAcronym().getValue(), getValueAsString("acronym", fields));
assertEquals(p.getTitle().getValue(), getValueAsString("title", fields));
assertEquals(
p.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(
p.getCollectedfrom().get(0).getValue(),
getValueAsString("collectedfromname", fields));
@ -86,6 +92,7 @@ public class MigrateDbEntitiesApplicationTest {
final Organization o = (Organization) list.get(0);
assertValidId(o.getId());
assertValidId(o.getCollectedfrom().get(0).getKey());
assertEquals(o.getLegalshortname().getValue(), getValueAsString("legalshortname", fields));
assertEquals(o.getLegalname().getValue(), getValueAsString("legalname", fields));
assertEquals(o.getWebsiteurl().getValue(), getValueAsString("websiteurl", fields));
@ -98,8 +105,6 @@ public class MigrateDbEntitiesApplicationTest {
assertEquals(
o.getCountry().getSchemename(),
getValueAsString("country", fields).split("@@@")[3]);
assertEquals(
o.getCollectedfrom().get(0).getKey(), getValueAsString("collectedfromid", fields));
assertEquals(
o.getCollectedfrom().get(0).getValue(),
getValueAsString("collectedfromname", fields));
@ -137,6 +142,8 @@ public class MigrateDbEntitiesApplicationTest {
assertValidId(r2.getSource());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey());
}
@Test
@ -146,7 +153,12 @@ public class MigrateDbEntitiesApplicationTest {
final List<Oaf> list = app.processClaims(rs);
assertEquals(1, list.size());
assertTrue(list.get(0) instanceof Result);
final Result r = (Result) list.get(0);
verifyMocks(fields);
assertValidId(r.getCollectedfrom().get(0).getKey());
}
@Test
@ -157,6 +169,33 @@ public class MigrateDbEntitiesApplicationTest {
assertEquals(2, list.size());
verifyMocks(fields);
assertTrue(list.get(0) instanceof Relation);
assertTrue(list.get(1) instanceof Relation);
final Relation r1 = (Relation) list.get(0);
final Relation r2 = (Relation) list.get(1);
assertValidId(r1.getSource());
assertValidId(r1.getTarget());
assertValidId(r2.getSource());
assertValidId(r2.getTarget());
assertNotNull(r1.getDataInfo());
assertNotNull(r2.getDataInfo());
assertNotNull(r1.getDataInfo().getTrust());
assertNotNull(r2.getDataInfo().getTrust());
assertEquals(r1.getSource(), r2.getTarget());
assertEquals(r2.getSource(), r1.getTarget());
assertTrue(StringUtils.isNotBlank(r1.getRelClass()));
assertTrue(StringUtils.isNotBlank(r2.getRelClass()));
assertTrue(StringUtils.isNotBlank(r1.getRelType()));
assertTrue(StringUtils.isNotBlank(r2.getRelType()));
assertValidId(r1.getCollectedfrom().get(0).getKey());
assertValidId(r2.getCollectedfrom().get(0).getKey());
// System.out.println(new ObjectMapper().writeValueAsString(r1));
// System.out.println(new ObjectMapper().writeValueAsString(r2));
}
private List<TypedField> prepareMocks(final String jsonFile) throws IOException, SQLException {

View File

@ -87,6 +87,7 @@
<oaf:language>und</oaf:language>
<oaf:concept id="https://zenodo.org/communities/epfl"/>
<oaf:hostedBy id="re3data_____::r3d100010468" name="Zenodo"/>
<oaf:projectid>corda_______::226852</oaf:projectid>
<oaf:collectedFrom id="re3data_____::r3d100010468" name="Zenodo"/>
</metadata>
<about xmlns:dc="http://purl.org/dc/elements/1.1/"

View File

@ -48,7 +48,7 @@ public class Scholix implements Serializable {
if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0)
s.setPublicationDate(scholixSummary.getDate().get(0));
s.setLinkprovider(
rel.getCollectedFrom().stream()
rel.getCollectedfrom().stream()
.map(
cf ->
new ScholixEntityId(
@ -73,7 +73,7 @@ public class Scholix implements Serializable {
if (scholixSummary.getDate() != null && scholixSummary.getDate().size() > 0)
s.setPublicationDate(scholixSummary.getDate().get(0));
s.setLinkprovider(
rel.getCollectedFrom().stream()
rel.getCollectedfrom().stream()
.map(
cf ->
new ScholixEntityId(

View File

@ -1 +1 @@
{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"IsReferencedBy","subRelType":null,"relClass":"datacite","source":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","target":"60|97519e00ee2cddfa1f5bcb5220429b8f","collectedFrom":[{"key":"dli_________::europe_pmc__","value":"Europe PMC","dataInfo":null}]}
{"dataInfo":{"invisible":false,"inferred":null,"deletedbyinference":false,"trust":"0.9","inferenceprovenance":null,"provenanceaction":null},"lastupdatetimestamp":null,"relType":"IsReferencedBy","subRelType":null,"relClass":"datacite","source":"50|dedup_______::4f00e4f0e82bb4cbb35261478e55568e","target":"60|97519e00ee2cddfa1f5bcb5220429b8f","collectedfrom":[{"key":"dli_________::europe_pmc__","value":"Europe PMC","dataInfo":null}]}

View File

@ -11,11 +11,13 @@ import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.*;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
@ -104,16 +106,12 @@ public class CreateRelatedEntitiesJob_phase1 {
SparkSession spark,
String inputRelationsPath,
String inputEntityPath,
Class<E> entityClazz,
Class<E> clazz,
String outputPath) {
Dataset<Tuple2<String, SortableRelation>> relsByTarget =
readPathRelation(spark, inputRelationsPath)
.filter(
(FilterFunction<SortableRelation>)
value ->
value.getDataInfo().getDeletedbyinference()
== false)
.filter("dataInfo.deletedbyinference == false")
.map(
(MapFunction<SortableRelation, Tuple2<String, SortableRelation>>)
r -> new Tuple2<>(r.getTarget(), r),
@ -122,10 +120,11 @@ public class CreateRelatedEntitiesJob_phase1 {
.cache();
Dataset<Tuple2<String, RelatedEntity>> entities =
readPathEntity(spark, inputEntityPath, entityClazz)
readPathEntity(spark, inputEntityPath, clazz)
.filter("dataInfo.invisible == false")
.map(
(MapFunction<E, RelatedEntity>)
value -> asRelatedEntity(value, entityClazz),
value -> asRelatedEntity(value, clazz),
Encoders.bean(RelatedEntity.class))
.map(
(MapFunction<RelatedEntity, Tuple2<String, RelatedEntity>>)
@ -146,7 +145,7 @@ public class CreateRelatedEntitiesJob_phase1 {
Encoders.bean(EntityRelEntity.class))
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath + "/" + EntityType.fromClass(entityClazz));
.parquet(outputPath + "/" + EntityType.fromClass(clazz));
}
private static <E extends OafEntity> Dataset<E> readPathEntity(
@ -161,6 +160,81 @@ public class CreateRelatedEntitiesJob_phase1 {
Encoders.bean(entityClazz));
}
public static <E extends OafEntity> RelatedEntity asRelatedEntity(E entity, Class<E> clazz) {
final RelatedEntity re = new RelatedEntity();
re.setId(entity.getId());
re.setType(EntityType.fromClass(clazz).name());
re.setPid(entity.getPid());
re.setCollectedfrom(entity.getCollectedfrom());
switch (EntityType.fromClass(clazz)) {
case publication:
case dataset:
case otherresearchproduct:
case software:
Result result = (Result) entity;
if (result.getTitle() != null && !result.getTitle().isEmpty()) {
re.setTitle(result.getTitle().stream().findFirst().get());
}
re.setDateofacceptance(getValue(result.getDateofacceptance()));
re.setPublisher(getValue(result.getPublisher()));
re.setResulttype(result.getResulttype());
re.setInstances(result.getInstance());
// TODO still to be mapped
// re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
break;
case datasource:
Datasource d = (Datasource) entity;
re.setOfficialname(getValue(d.getOfficialname()));
re.setWebsiteurl(getValue(d.getWebsiteurl()));
re.setDatasourcetype(d.getDatasourcetype());
re.setOpenairecompatibility(d.getOpenairecompatibility());
break;
case organization:
Organization o = (Organization) entity;
re.setLegalname(getValue(o.getLegalname()));
re.setLegalshortname(getValue(o.getLegalshortname()));
re.setCountry(o.getCountry());
re.setWebsiteurl(getValue(o.getWebsiteurl()));
break;
case project:
Project p = (Project) entity;
re.setProjectTitle(getValue(p.getTitle()));
re.setCode(getValue(p.getCode()));
re.setAcronym(getValue(p.getAcronym()));
re.setContracttype(p.getContracttype());
List<Field<String>> f = p.getFundingtree();
if (!f.isEmpty()) {
re.setFundingtree(
f.stream().map(s -> s.getValue()).collect(Collectors.toList()));
}
break;
}
return re;
}
private static String getValue(Field<String> field) {
return getFieldValueWithDefault(field, "");
}
private static <T> T getFieldValueWithDefault(Field<T> f, T defaultValue) {
return Optional.ofNullable(f)
.filter(Objects::nonNull)
.map(x -> x.getValue())
.orElse(defaultValue);
}
/**
* Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline
* delimited json text file,

View File

@ -76,9 +76,6 @@ public class PrepareRelationsJob {
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
int numPartitions = Integer.parseInt(parser.get("relPartitions"));
log.info("relPartitions: {}", numPartitions);
SparkConf conf = new SparkConf();
runWithSparkSession(
@ -86,16 +83,14 @@ public class PrepareRelationsJob {
isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareRelationsFromPaths(spark, inputRelationsPath, outputPath, numPartitions);
prepareRelationsFromPaths(spark, inputRelationsPath, outputPath);
});
}
private static void prepareRelationsFromPaths(
SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) {
SparkSession spark, String inputRelationsPath, String outputPath) {
readPathRelation(spark, inputRelationsPath)
.filter(
(FilterFunction<SortableRelation>)
value -> value.getDataInfo().getDeletedbyinference() == false)
.filter("dataInfo.deletedbyinference == false")
.groupByKey(
(MapFunction<SortableRelation, String>) value -> value.getSource(),
Encoders.STRING())
@ -103,7 +98,6 @@ public class PrepareRelationsJob {
(FlatMapGroupsFunction<String, SortableRelation, SortableRelation>)
(key, values) -> Iterators.limit(values, MAX_RELS),
Encoders.bean(SortableRelation.class))
.repartition(numPartitions)
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);

View File

@ -3,14 +3,8 @@ package eu.dnetlib.dhp.oa.provision.utils;
import static org.apache.commons.lang3.StringUtils.substringAfter;
import com.google.common.collect.Sets;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.schema.common.EntityType;
import eu.dnetlib.dhp.schema.oaf.*;
import java.util.List;
import java.util.Objects;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;
public class GraphMappingUtils {
@ -18,81 +12,6 @@ public class GraphMappingUtils {
public static Set<String> authorPidTypes = Sets.newHashSet("orcid", "magidentifier");
public static <E extends OafEntity> RelatedEntity asRelatedEntity(E entity, Class<E> clazz) {
final RelatedEntity re = new RelatedEntity();
re.setId(entity.getId());
re.setType(EntityType.fromClass(clazz).name());
re.setPid(entity.getPid());
re.setCollectedfrom(entity.getCollectedfrom());
switch (EntityType.fromClass(clazz)) {
case publication:
case dataset:
case otherresearchproduct:
case software:
Result result = (Result) entity;
if (result.getTitle() == null && !result.getTitle().isEmpty()) {
re.setTitle(result.getTitle().stream().findFirst().get());
}
re.setDateofacceptance(getValue(result.getDateofacceptance()));
re.setPublisher(getValue(result.getPublisher()));
re.setResulttype(result.getResulttype());
re.setInstances(result.getInstance());
// TODO still to be mapped
// re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
break;
case datasource:
Datasource d = (Datasource) entity;
re.setOfficialname(getValue(d.getOfficialname()));
re.setWebsiteurl(getValue(d.getWebsiteurl()));
re.setDatasourcetype(d.getDatasourcetype());
re.setOpenairecompatibility(d.getOpenairecompatibility());
break;
case organization:
Organization o = (Organization) entity;
re.setLegalname(getValue(o.getLegalname()));
re.setLegalshortname(getValue(o.getLegalshortname()));
re.setCountry(o.getCountry());
re.setWebsiteurl(getValue(o.getWebsiteurl()));
break;
case project:
Project p = (Project) entity;
re.setProjectTitle(getValue(p.getTitle()));
re.setCode(getValue(p.getCode()));
re.setAcronym(getValue(p.getAcronym()));
re.setContracttype(p.getContracttype());
List<Field<String>> f = p.getFundingtree();
if (!f.isEmpty()) {
re.setFundingtree(
f.stream().map(s -> s.getValue()).collect(Collectors.toList()));
}
break;
}
return re;
}
private static String getValue(Field<String> field) {
return getFieldValueWithDefault(field, "");
}
private static <T> T getFieldValueWithDefault(Field<T> f, T defaultValue) {
return Optional.ofNullable(f)
.filter(Objects::nonNull)
.map(x -> x.getValue())
.orElse(defaultValue);
}
public static String removePrefix(final String s) {
if (s.contains("|")) return substringAfter(s, "|");
return s;

View File

@ -22,6 +22,7 @@ import java.io.IOException;
import java.io.Serializable;
import java.io.StringReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -41,6 +42,7 @@ import org.dom4j.io.XMLWriter;
public class XmlRecordFactory implements Serializable {
public static final String REL_SUBTYPE_DEDUP = "dedup";
private Map<String, LongAccumulator> accumulators;
private Set<String> specialDatasourceTypes;
@ -91,7 +93,14 @@ public class XmlRecordFactory implements Serializable {
// rels has to be processed before the contexts because they enrich the contextMap with
// the
// funding info.
final List<String> relations = listRelations(je, templateFactory, contexts);
final List<String> relations =
je.getLinks().stream()
.filter(
t ->
!REL_SUBTYPE_DEDUP.equalsIgnoreCase(
t.getRelation().getSubRelType()))
.map(link -> mapRelation(link, templateFactory, contexts))
.collect(Collectors.toCollection(ArrayList::new));
final String mainType = ModelSupport.getMainType(type);
metadata.addAll(buildContexts(mainType, contexts));
@ -102,7 +111,7 @@ public class XmlRecordFactory implements Serializable {
mainType,
metadata,
relations,
listChildren(entity, je.getEntity().getType(), templateFactory),
listChildren(entity, je, templateFactory),
listExtraInfo(entity));
return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent);
@ -919,171 +928,149 @@ public class XmlRecordFactory implements Serializable {
metadata.add(XmlSerializationUtils.mapQualifier("datasourcetypeui", dsType));
}
private Qualifier getBestAccessright(final Result r) {
Qualifier bestAccessRight = new Qualifier();
bestAccessRight.setClassid("UNKNOWN");
bestAccessRight.setClassname("not available");
bestAccessRight.setSchemeid("dnet:access_modes");
bestAccessRight.setSchemename("dnet:access_modes");
private String mapRelation(Tuple2 link, TemplateFactory templateFactory, Set<String> contexts) {
final Relation rel = link.getRelation();
final RelatedEntity re = link.getRelatedEntity();
final String targetType = link.getRelatedEntity().getType();
final LicenseComparator lc = new LicenseComparator();
for (final Instance instance : r.getInstance()) {
if (lc.compare(bestAccessRight, instance.getAccessright()) > 0) {
bestAccessRight = instance.getAccessright();
}
final List<String> metadata = Lists.newArrayList();
switch (EntityType.valueOf(targetType)) {
case publication:
case dataset:
case otherresearchproduct:
case software:
if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) {
metadata.add(
XmlSerializationUtils.mapStructuredProperty("title", re.getTitle()));
}
if (isNotBlank(re.getDateofacceptance())) {
metadata.add(
XmlSerializationUtils.asXmlElement(
"dateofacceptance", re.getDateofacceptance()));
}
if (isNotBlank(re.getPublisher())) {
metadata.add(
XmlSerializationUtils.asXmlElement("publisher", re.getPublisher()));
}
if (isNotBlank(re.getCodeRepositoryUrl())) {
metadata.add(
XmlSerializationUtils.asXmlElement(
"coderepositoryurl", re.getCodeRepositoryUrl()));
}
if (re.getResulttype() != null & re.getResulttype().isBlank()) {
metadata.add(
XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype()));
}
if (re.getCollectedfrom() != null) {
metadata.addAll(
re.getCollectedfrom().stream()
.map(
kv ->
XmlSerializationUtils.mapKeyValue(
"collectedfrom", kv))
.collect(Collectors.toList()));
}
if (re.getPid() != null) {
metadata.addAll(
re.getPid().stream()
.map(p -> XmlSerializationUtils.mapStructuredProperty("pid", p))
.collect(Collectors.toList()));
}
break;
case datasource:
if (isNotBlank(re.getOfficialname())) {
metadata.add(
XmlSerializationUtils.asXmlElement(
"officialname", re.getOfficialname()));
}
if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) {
mapDatasourceType(metadata, re.getDatasourcetype());
}
if (re.getOpenairecompatibility() != null
& !re.getOpenairecompatibility().isBlank()) {
metadata.add(
XmlSerializationUtils.mapQualifier(
"openairecompatibility", re.getOpenairecompatibility()));
}
break;
case organization:
if (isNotBlank(re.getLegalname())) {
metadata.add(
XmlSerializationUtils.asXmlElement("legalname", re.getLegalname()));
}
if (isNotBlank(re.getLegalshortname())) {
metadata.add(
XmlSerializationUtils.asXmlElement(
"legalshortname", re.getLegalshortname()));
}
if (re.getCountry() != null & !re.getCountry().isBlank()) {
metadata.add(XmlSerializationUtils.mapQualifier("country", re.getCountry()));
}
break;
case project:
if (isNotBlank(re.getProjectTitle())) {
metadata.add(XmlSerializationUtils.asXmlElement("title", re.getProjectTitle()));
}
if (isNotBlank(re.getCode())) {
metadata.add(XmlSerializationUtils.asXmlElement("code", re.getCode()));
}
if (isNotBlank(re.getAcronym())) {
metadata.add(XmlSerializationUtils.asXmlElement("acronym", re.getAcronym()));
}
if (re.getContracttype() != null & !re.getContracttype().isBlank()) {
metadata.add(
XmlSerializationUtils.mapQualifier(
"contracttype", re.getContracttype()));
}
if (re.getFundingtree() != null & contexts != null) {
metadata.addAll(
re.getFundingtree().stream()
.peek(ft -> fillContextMap(ft, contexts))
.map(ft -> getRelFundingTree(ft))
.collect(Collectors.toList()));
}
break;
default:
throw new IllegalArgumentException("invalid target type: " + targetType);
}
return bestAccessRight;
}
final DataInfo info = rel.getDataInfo();
final String scheme = ModelSupport.getScheme(re.getType(), targetType);
private List<String> listRelations(
final JoinedEntity je, TemplateFactory templateFactory, final Set<String> contexts) {
final List<String> rels = Lists.newArrayList();
for (final Tuple2 link : je.getLinks()) {
final Relation rel = link.getRelation();
final RelatedEntity re = link.getRelatedEntity();
final String targetType = link.getRelatedEntity().getType();
final List<String> metadata = Lists.newArrayList();
switch (EntityType.valueOf(targetType)) {
case publication:
case dataset:
case otherresearchproduct:
case software:
if (re.getTitle() != null && isNotBlank(re.getTitle().getValue())) {
metadata.add(
XmlSerializationUtils.mapStructuredProperty(
"title", re.getTitle()));
}
if (isNotBlank(re.getDateofacceptance())) {
metadata.add(
XmlSerializationUtils.asXmlElement(
"dateofacceptance", re.getDateofacceptance()));
}
if (isNotBlank(re.getPublisher())) {
metadata.add(
XmlSerializationUtils.asXmlElement("publisher", re.getPublisher()));
}
if (isNotBlank(re.getCodeRepositoryUrl())) {
metadata.add(
XmlSerializationUtils.asXmlElement(
"coderepositoryurl", re.getCodeRepositoryUrl()));
}
if (re.getResulttype() != null & re.getResulttype().isBlank()) {
metadata.add(
XmlSerializationUtils.mapQualifier(
"resulttype", re.getResulttype()));
}
if (re.getCollectedfrom() != null) {
metadata.addAll(
re.getCollectedfrom().stream()
.map(
kv ->
XmlSerializationUtils.mapKeyValue(
"collectedfrom", kv))
.collect(Collectors.toList()));
}
if (re.getPid() != null) {
metadata.addAll(
re.getPid().stream()
.map(
p ->
XmlSerializationUtils.mapStructuredProperty(
"pid", p))
.collect(Collectors.toList()));
}
break;
case datasource:
if (isNotBlank(re.getOfficialname())) {
metadata.add(
XmlSerializationUtils.asXmlElement(
"officialname", re.getOfficialname()));
}
if (re.getDatasourcetype() != null & !re.getDatasourcetype().isBlank()) {
mapDatasourceType(metadata, re.getDatasourcetype());
}
if (re.getOpenairecompatibility() != null
& !re.getOpenairecompatibility().isBlank()) {
metadata.add(
XmlSerializationUtils.mapQualifier(
"openairecompatibility", re.getOpenairecompatibility()));
}
break;
case organization:
if (isNotBlank(re.getLegalname())) {
metadata.add(
XmlSerializationUtils.asXmlElement("legalname", re.getLegalname()));
}
if (isNotBlank(re.getLegalshortname())) {
metadata.add(
XmlSerializationUtils.asXmlElement(
"legalshortname", re.getLegalshortname()));
}
if (re.getCountry() != null & !re.getCountry().isBlank()) {
metadata.add(
XmlSerializationUtils.mapQualifier("country", re.getCountry()));
}
break;
case project:
if (isNotBlank(re.getProjectTitle())) {
metadata.add(
XmlSerializationUtils.asXmlElement("title", re.getProjectTitle()));
}
if (isNotBlank(re.getCode())) {
metadata.add(XmlSerializationUtils.asXmlElement("code", re.getCode()));
}
if (isNotBlank(re.getAcronym())) {
metadata.add(
XmlSerializationUtils.asXmlElement("acronym", re.getAcronym()));
}
if (re.getContracttype() != null & !re.getContracttype().isBlank()) {
metadata.add(
XmlSerializationUtils.mapQualifier(
"contracttype", re.getContracttype()));
}
if (re.getFundingtree() != null) {
metadata.addAll(
re.getFundingtree().stream()
.peek(ft -> fillContextMap(ft, contexts))
.map(ft -> getRelFundingTree(ft))
.collect(Collectors.toList()));
}
break;
default:
throw new IllegalArgumentException("invalid target type: " + targetType);
}
final DataInfo info = rel.getDataInfo();
final String scheme = ModelSupport.getScheme(re.getType(), targetType);
if (StringUtils.isBlank(scheme)) {
throw new IllegalArgumentException(
String.format("missing scheme for: <%s - %s>", re.getType(), targetType));
}
final String accumulatorName =
getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass());
if (accumulators.containsKey(accumulatorName)) {
accumulators.get(accumulatorName).add(1);
}
rels.add(
templateFactory.getRel(
targetType,
rel.getTarget(),
Sets.newHashSet(metadata),
rel.getRelClass(),
scheme,
info));
if (StringUtils.isBlank(scheme)) {
throw new IllegalArgumentException(
String.format("missing scheme for: <%s - %s>", re.getType(), targetType));
}
return rels;
final String accumulatorName =
getRelDescriptor(rel.getRelType(), rel.getSubRelType(), rel.getRelClass());
if (accumulators.containsKey(accumulatorName)) {
accumulators.get(accumulatorName).add(1);
}
return templateFactory.getRel(
targetType,
rel.getTarget(),
Sets.newHashSet(metadata),
rel.getRelClass(),
scheme,
info);
}
private List<String> listChildren(
final OafEntity entity, String type, TemplateFactory templateFactory) {
final OafEntity entity, JoinedEntity je, TemplateFactory templateFactory) {
final List<String> children = Lists.newArrayList();
EntityType entityType = EntityType.valueOf(type);
EntityType entityType = EntityType.valueOf(je.getEntity().getType());
children.addAll(
je.getLinks().stream()
.filter(
link ->
REL_SUBTYPE_DEDUP.equalsIgnoreCase(
link.getRelation().getSubRelType()))
.map(link -> mapRelation(link, templateFactory, null))
.collect(Collectors.toCollection(ArrayList::new)));
if (MainEntityType.result.toString().equals(ModelSupport.getMainType(entityType))) {
final List<Instance> instances = ((Result) entity).getInstance();
if (instances != null) {

View File

@ -98,6 +98,7 @@
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>

View File

@ -262,7 +262,8 @@
sparkDriverMemory,sparkExecutorMemory,sparkExecutorCores,
oozie.wf.application.path,projectVersion,oozie.use.system.libpath,
oozieActionShareLibForSpark1,spark1YarnHistoryServerAddress,spark1EventLogDir,
oozieActionShareLibForSpark2,spark2YarnHistoryServerAddress,spark2EventLogDir
oozieActionShareLibForSpark2,spark2YarnHistoryServerAddress,spark2EventLogDir,
sparkSqlWarehouseDir
</include>
<includeSystemProperties>true</includeSystemProperties>
<includePropertyKeysFromFiles>