Merge branch 'provision_dataset'

This commit is contained in:
Claudio Atzori 2020-04-07 08:56:58 +02:00
commit c57cf679ca
37 changed files with 2320 additions and 987 deletions

View File

@ -13,6 +13,26 @@
<packaging>jar</packaging>
<dependencies>
<dependency>
<groupId>eu.dnetlib.dhp</groupId>
<artifactId>dhp-schemas</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>

View File

@ -0,0 +1,56 @@
package eu.dnetlib.dhp.common;
import java.io.Serializable;
import java.util.function.Supplier;
/**
* Provides serializable and throwing extensions to standard functional interfaces.
*/
public class FunctionalInterfaceSupport {
private FunctionalInterfaceSupport() {
}
/**
* Serializable supplier of any kind of objects. To be used withing spark processing pipelines when supplying
* functions externally.
*
* @param <T>
*/
@FunctionalInterface
public interface SerializableSupplier<T> extends Supplier<T>, Serializable {
}
/**
* Extension of consumer accepting functions throwing an exception.
*
* @param <T>
* @param <E>
*/
@FunctionalInterface
public interface ThrowingConsumer<T, E extends Exception> {
void accept(T t) throws E;
}
/**
* Extension of supplier accepting functions throwing an exception.
*
* @param <T>
* @param <E>
*/
@FunctionalInterface
public interface ThrowingSupplier<T, E extends Exception> {
T get() throws E;
}
/**
* Extension of runnable accepting functions throwing an exception.
*
* @param <E>
*/
@FunctionalInterface
public interface ThrowingRunnable<E extends Exception> {
void run() throws E;
}
}

View File

@ -0,0 +1,57 @@
package eu.dnetlib.dhp.common;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.ThrowingSupport.rethrowAsRuntimeException;
/**
* HDFS utility methods.
*/
public class HdfsSupport {
private static final Logger logger = LoggerFactory.getLogger(HdfsSupport.class);
private HdfsSupport() {
}
/**
* Removes a path (file or dir) from HDFS.
*
* @param path Path to be removed
* @param configuration Configuration of hadoop env
*/
public static void remove(String path, Configuration configuration) {
logger.info("Removing path: {}", path);
rethrowAsRuntimeException(() -> {
Path f = new Path(path);
FileSystem fileSystem = FileSystem.get(configuration);
if (fileSystem.exists(f)) {
fileSystem.delete(f, true);
}
});
}
/**
* Lists hadoop files located below path or alternatively lists subdirs under path.
*
* @param path Path to be listed for hadoop files
* @param configuration Configuration of hadoop env
* @return List with string locations of hadoop files
*/
public static List<String> listFiles(String path, Configuration configuration) {
logger.info("Listing files in path: {}", path);
return rethrowAsRuntimeException(() -> Arrays
.stream(FileSystem.get(configuration).listStatus(new Path(path)))
.filter(FileStatus::isDirectory)
.map(x -> x.getPath().toString())
.collect(Collectors.toList()));
}
}

View File

@ -0,0 +1,57 @@
package eu.dnetlib.dhp.common;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import java.util.Objects;
import java.util.function.Function;
/**
* SparkSession utility methods.
*/
public class SparkSessionSupport {
private SparkSessionSupport() {
}
/**
* Runs a given function using SparkSession created using default builder and supplied SparkConf. Stops SparkSession
* when SparkSession is managed. Allows to reuse SparkSession created externally.
*
* @param conf SparkConf instance
* @param isSparkSessionManaged When true will stop SparkSession
* @param fn Consumer to be applied to constructed SparkSession
*/
public static void runWithSparkSession(SparkConf conf,
Boolean isSparkSessionManaged,
ThrowingConsumer<SparkSession, Exception> fn) {
runWithSparkSession(c -> SparkSession.builder().config(c).getOrCreate(), conf, isSparkSessionManaged, fn);
}
/**
* Runs a given function using SparkSession created using supplied builder and supplied SparkConf. Stops SparkSession
* when SparkSession is managed. Allows to reuse SparkSession created externally.
*
* @param sparkSessionBuilder Builder of SparkSession
* @param conf SparkConf instance
* @param isSparkSessionManaged When true will stop SparkSession
* @param fn Consumer to be applied to constructed SparkSession
*/
public static void runWithSparkSession(Function<SparkConf, SparkSession> sparkSessionBuilder,
SparkConf conf,
Boolean isSparkSessionManaged,
ThrowingConsumer<SparkSession, Exception> fn) {
SparkSession spark = null;
try {
spark = sparkSessionBuilder.apply(conf);
fn.accept(spark);
} catch (Exception e) {
throw new RuntimeException(e);
} finally {
if (Objects.nonNull(spark) && isSparkSessionManaged) {
spark.stop();
}
}
}
}

View File

@ -0,0 +1,76 @@
package eu.dnetlib.dhp.common;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingRunnable;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingSupplier;
/**
* Exception handling utility methods.
*/
public class ThrowingSupport {
private ThrowingSupport() {
}
/**
* Executes given runnable and rethrows any exceptions as RuntimeException.
*
* @param fn Runnable to be executed
* @param <E> Type of exception thrown
*/
public static <E extends Exception> void rethrowAsRuntimeException(ThrowingRunnable<E> fn) {
try {
fn.run();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
* Executes given runnable and rethrows any exceptions as RuntimeException with custom message.
*
* @param fn Runnable to be executed
* @param msg Message to be set for rethrown exception
* @param <E> Type of exception thrown
*/
public static <E extends Exception> void rethrowAsRuntimeException(ThrowingRunnable<E> fn, String msg) {
try {
fn.run();
} catch (Exception e) {
throw new RuntimeException(msg, e);
}
}
/**
* Executes given supplier and rethrows any exceptions as RuntimeException.
*
* @param fn Supplier to be executed
* @param <T> Type of returned value
* @param <E> Type of exception thrown
* @return Result of supplier execution
*/
public static <T, E extends Exception> T rethrowAsRuntimeException(ThrowingSupplier<T, E> fn) {
try {
return fn.get();
} catch (Exception e) {
throw new RuntimeException(e);
}
}
/**
* Executes given supplier and rethrows any exceptions as RuntimeException with custom message.
*
* @param fn Supplier to be executed
* @param msg Message to be set for rethrown exception
* @param <T> Type of returned value
* @param <E> Type of exception thrown
* @return Result of supplier execution
*/
public static <T, E extends Exception> T rethrowAsRuntimeException(ThrowingSupplier<T, E> fn, String msg) {
try {
return fn.get();
} catch (Exception e) {
throw new RuntimeException(msg, e);
}
}
}

View File

@ -0,0 +1,78 @@
package eu.dnetlib.dhp.common;
import org.apache.hadoop.conf.Configuration;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import static org.junit.jupiter.api.Assertions.*;
public class HdfsSupportTest {
@Nested
class Remove {
@Test
public void shouldThrowARuntimeExceptionOnError() {
// when
assertThrows(RuntimeException.class, () ->
HdfsSupport.remove(null, new Configuration()));
}
@Test
public void shouldRemoveADirFromHDFS(@TempDir Path tempDir) {
// when
HdfsSupport.remove(tempDir.toString(), new Configuration());
// then
assertFalse(Files.exists(tempDir));
}
@Test
public void shouldRemoveAFileFromHDFS(@TempDir Path tempDir) throws IOException {
// given
Path file = Files.createTempFile(tempDir, "p", "s");
// when
HdfsSupport.remove(file.toString(), new Configuration());
// then
assertFalse(Files.exists(file));
}
}
@Nested
class ListFiles {
@Test
public void shouldThrowARuntimeExceptionOnError() {
// when
assertThrows(RuntimeException.class, () ->
HdfsSupport.listFiles(null, new Configuration()));
}
@Test
public void shouldListFilesLocatedInPath(@TempDir Path tempDir) throws IOException {
Path subDir1 = Files.createTempDirectory(tempDir, "list_me");
Path subDir2 = Files.createTempDirectory(tempDir, "list_me");
// when
List<String> paths = HdfsSupport.listFiles(tempDir.toString(), new Configuration());
// then
assertEquals(2, paths.size());
List<String> expecteds = Arrays.stream(new String[]{subDir1.toString(), subDir2.toString()})
.sorted().collect(Collectors.toList());
List<String> actuals = paths.stream().sorted().collect(Collectors.toList());
assertTrue(actuals.get(0).contains(expecteds.get(0)));
assertTrue(actuals.get(1).contains(expecteds.get(1)));
}
}
}

View File

@ -0,0 +1,36 @@
package eu.dnetlib.dhp.common;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import eu.dnetlib.dhp.schema.oaf.Relation;
import eu.dnetlib.dhp.schema.oaf.Result;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertTrue;
public class ModelSupportTest {
@Nested
class IsSubClass {
@Test
public void shouldReturnFalseWhenSubClassDoesNotExtendSuperClass() {
// when
Boolean result = ModelSupport.isSubClass(Relation.class, OafEntity.class);
// then
assertFalse(result);
}
@Test
public void shouldReturnTrueWhenSubClassExtendsSuperClass() {
// when
Boolean result = ModelSupport.isSubClass(Result.class, OafEntity.class);
// then
assertTrue(result);
}
}
}

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.common;
import eu.dnetlib.dhp.common.FunctionalInterfaceSupport.ThrowingConsumer;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import java.util.function.Function;
import static org.mockito.Mockito.*;
public class SparkSessionSupportTest {
@Nested
class RunWithSparkSession {
@Test
public void shouldExecuteFunctionAndNotStopSparkSessionWhenSparkSessionIsNotManaged() throws Exception {
// given
SparkSession spark = mock(SparkSession.class);
SparkConf conf = mock(SparkConf.class);
Function<SparkConf, SparkSession> sparkSessionBuilder = mock(Function.class);
when(sparkSessionBuilder.apply(conf)).thenReturn(spark);
ThrowingConsumer<SparkSession, Exception> fn = mock(ThrowingConsumer.class);
// when
SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, false, fn);
// then
verify(sparkSessionBuilder).apply(conf);
verify(fn).accept(spark);
verify(spark, never()).stop();
}
@Test
public void shouldExecuteFunctionAndStopSparkSessionWhenSparkSessionIsManaged() throws Exception {
// given
SparkSession spark = mock(SparkSession.class);
SparkConf conf = mock(SparkConf.class);
Function<SparkConf, SparkSession> sparkSessionBuilder = mock(Function.class);
when(sparkSessionBuilder.apply(conf)).thenReturn(spark);
ThrowingConsumer<SparkSession, Exception> fn = mock(ThrowingConsumer.class);
// when
SparkSessionSupport.runWithSparkSession(sparkSessionBuilder, conf, true, fn);
// then
verify(sparkSessionBuilder).apply(conf);
verify(fn).accept(spark);
verify(spark, times(1)).stop();
}
}
}

View File

@ -0,0 +1,51 @@
package eu.dnetlib.dhp.schema.common;
import eu.dnetlib.dhp.schema.oaf.Oaf;
/**
* Inheritance utility methods.
*/
public class ModelSupport {
private ModelSupport() {
}
/**
* Checks subclass-superclass relationship.
*
* @param subClazzObject Subclass object instance
* @param superClazzObject Superclass object instance
* @param <X> Subclass type
* @param <Y> Superclass type
* @return True if X is a subclass of Y
*/
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(X subClazzObject, Y superClazzObject) {
return isSubClass(subClazzObject.getClass(), superClazzObject.getClass());
}
/**
* Checks subclass-superclass relationship.
*
* @param subClazzObject Subclass object instance
* @param superClazz Superclass class
* @param <X> Subclass type
* @param <Y> Superclass type
* @return True if X is a subclass of Y
*/
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(X subClazzObject, Class<Y> superClazz) {
return isSubClass(subClazzObject.getClass(), superClazz);
}
/**
* Checks subclass-superclass relationship.
*
* @param subClazz Subclass class
* @param superClazz Superclass class
* @param <X> Subclass type
* @param <Y> Superclass type
* @return True if X is a subclass of Y
*/
public static <X extends Oaf, Y extends Oaf> Boolean isSubClass(Class<X> subClazz, Class<Y> superClazz) {
return superClazz.isAssignableFrom(subClazz);
}
}

View File

@ -93,8 +93,7 @@ public class Relation extends Oaf {
subRelType.equals(relation.subRelType) &&
relClass.equals(relation.relClass) &&
source.equals(relation.source) &&
target.equals(relation.target) &&
Objects.equals(collectedFrom, relation.collectedFrom);
target.equals(relation.target);
}
@Override

View File

@ -0,0 +1,123 @@
package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.Tuple2;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.MapGroupsFunction;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses;
/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
*
* The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again
* by E, finally grouped by E.id;
*
* The workflow is organized in different parts aimed to to reduce the complexity of the operation
* 1) PrepareRelationsJob:
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
* can be linked at most to 100 other objects
*
* 2) JoinRelationEntityByTargetJob:
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
*/
public class AdjacencyListBuilderJob {
private static final Logger log = LoggerFactory.getLogger(AdjacencyListBuilderJob.class);
public static final int MAX_LINKS = 100;
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
AdjacencyListBuilderJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(getKryoClasses());
runWithSparkSession(conf, isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
createAdjacencyLists(spark, inputPath, outputPath);
});
}
private static void createAdjacencyLists(SparkSession spark, String inputPath, String outputPath) {
log.info("Reading joined entities from: {}", inputPath);
spark.read()
.load(inputPath)
.as(Encoders.bean(EntityRelEntity.class))
.groupByKey((MapFunction<EntityRelEntity, String>) value -> value.getEntity().getId(), Encoders.STRING())
.mapGroups((MapGroupsFunction<String, EntityRelEntity, JoinedEntity>) (key, values) -> {
JoinedEntity j = new JoinedEntity();
List<Tuple2> links = new ArrayList<>();
while (values.hasNext() && links.size() < MAX_LINKS) {
EntityRelEntity curr = values.next();
if (j.getEntity() == null) {
j.setEntity(curr.getEntity());
}
links.add(new Tuple2(curr.getRelation(), curr.getTarget()));
}
j.setLinks(links);
return j;
}, Encoders.bean(JoinedEntity.class))
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

View File

@ -0,0 +1,156 @@
package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import org.apache.commons.io.IOUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.*;
/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
*
* The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again
* by E, finally grouped by E.id;
*
* The workflow is organized in different parts aimed to to reduce the complexity of the operation
* 1) PrepareRelationsJob:
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
* can be linked at most to 100 other objects
*
* 2) JoinRelationEntityByTargetJob:
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
*/
public class CreateRelatedEntitiesJob_phase1 {
private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase1.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils.toString(
PrepareRelationsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase1.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputRelationsPath = parser.get("inputRelationsPath");
log.info("inputRelationsPath: {}", inputRelationsPath);
String inputEntityPath = parser.get("inputEntityPath");
log.info("inputEntityPath: {}", inputEntityPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String graphTableClassName = parser.get("graphTableClassName");
log.info("graphTableClassName: {}", graphTableClassName);
Class<? extends OafEntity> entityClazz = (Class<? extends OafEntity>) Class.forName(graphTableClassName);
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(getKryoClasses());
runWithSparkSession(conf, isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
joinRelationEntity(spark, inputRelationsPath, inputEntityPath, entityClazz, outputPath);
});
}
private static <E extends OafEntity> void joinRelationEntity(SparkSession spark, String inputRelationsPath, String inputEntityPath, Class<E> entityClazz, String outputPath) {
Dataset<Tuple2<String, SortableRelation>> relsByTarget = readPathRelation(spark, inputRelationsPath)
.filter((FilterFunction<SortableRelation>) value -> value.getDataInfo().getDeletedbyinference() == false)
.map((MapFunction<SortableRelation, Tuple2<String, SortableRelation>>) r -> new Tuple2<>(r.getTarget(), r),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(SortableRelation.class)))
.cache();
Dataset<Tuple2<String, RelatedEntity>> entities = readPathEntity(spark, inputEntityPath, entityClazz)
.map((MapFunction<E, RelatedEntity>) value -> asRelatedEntity(value, entityClazz), Encoders.bean(RelatedEntity.class))
.map((MapFunction<RelatedEntity, Tuple2<String, RelatedEntity>>) e -> new Tuple2<>(e.getId(), e),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(RelatedEntity.class)))
.cache();
relsByTarget
.joinWith(entities, entities.col("_1").equalTo(relsByTarget.col("_1")), "inner")
.map((MapFunction<Tuple2<Tuple2<String, SortableRelation>, Tuple2<String, RelatedEntity>>, EntityRelEntity>)
t -> new EntityRelEntity(t._1()._2(), t._2()._2()),
Encoders.bean(EntityRelEntity.class))
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath + "/" + EntityType.fromClass(entityClazz));
}
private static <E extends OafEntity> Dataset<E> readPathEntity(SparkSession spark, String inputEntityPath, Class<E> entityClazz) {
log.info("Reading Graph table from: {}", inputEntityPath);
return spark
.read()
.textFile(inputEntityPath)
.map((MapFunction<String, E>) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz));
}
/**
* Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file,
*
* @param spark
* @param relationPath
* @return the Dataset<SortableRelation> containing all the relationships
*/
private static Dataset<SortableRelation> readPathRelation(SparkSession spark, final String relationPath) {
log.info("Reading relations from: {}", relationPath);
return spark.read()
.load(relationPath)
.as(Encoders.bean(SortableRelation.class));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

View File

@ -0,0 +1,194 @@
package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.TypedRow;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.FilterFunction;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import scala.collection.JavaConverters;
import scala.collection.Seq;
import java.util.List;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.getKryoClasses;
/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
*
* The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again
* by E, finally grouped by E.id;
*
* The workflow is organized in different parts aimed to to reduce the complexity of the operation
* 1) PrepareRelationsJob:
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
* can be linked at most to 100 other objects
*
* 2) JoinRelationEntityByTargetJob:
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
*/
public class CreateRelatedEntitiesJob_phase2 {
private static final Logger log = LoggerFactory.getLogger(CreateRelatedEntitiesJob_phase2.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils.toString(
PrepareRelationsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_related_entities_pahase2.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputRelatedEntitiesPath = parser.get("inputRelatedEntitiesPath");
log.info("inputRelatedEntitiesPath: {}", inputRelatedEntitiesPath);
String inputGraphRootPath = parser.get("inputGraphRootPath");
log.info("inputGraphRootPath: {}", inputGraphRootPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
int numPartitions = Integer.parseInt(parser.get("numPartitions"));
log.info("numPartitions: {}", numPartitions);
SparkConf conf = new SparkConf();
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
conf.registerKryoClasses(getKryoClasses());
runWithSparkSession(conf, isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
joinAllEntities(spark, inputRelatedEntitiesPath, inputGraphRootPath, outputPath, numPartitions);
});
}
private static void joinAllEntities(SparkSession spark, String inputRelatedEntitiesPath, String inputGraphRootPath, String outputPath, int numPartitions) {
Dataset<Tuple2<String, TypedRow>> entities = readAllEntities(spark, inputGraphRootPath, numPartitions);
Dataset<Tuple2<String, EntityRelEntity>> relsBySource = readRelatedEntities(spark, inputRelatedEntitiesPath);
entities
.joinWith(relsBySource, entities.col("_1").equalTo(relsBySource.col("_1")), "left_outer")
.map((MapFunction<Tuple2<Tuple2<String, TypedRow>, Tuple2<String, EntityRelEntity>>, EntityRelEntity>) value -> {
EntityRelEntity re = new EntityRelEntity();
re.setEntity(value._1()._2());
Optional<EntityRelEntity> related = Optional.ofNullable(value._2()).map(Tuple2::_2);
if (related.isPresent()) {
re.setRelation(related.get().getRelation());
re.setTarget(related.get().getTarget());
}
return re;
}, Encoders.bean(EntityRelEntity.class))
.repartition(numPartitions)
.filter((FilterFunction<EntityRelEntity>) value -> value.getEntity() != null && StringUtils.isNotBlank(value.getEntity().getId()))
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
}
private static Dataset<Tuple2<String, TypedRow>> readAllEntities(SparkSession spark, String inputGraphPath, int numPartitions) {
Dataset<TypedRow> publication = readPathEntity(spark, inputGraphPath + "/publication", Publication.class);
Dataset<TypedRow> dataset = readPathEntity(spark, inputGraphPath + "/dataset", eu.dnetlib.dhp.schema.oaf.Dataset.class);
Dataset<TypedRow> other = readPathEntity(spark, inputGraphPath + "/otherresearchproduct", OtherResearchProduct.class);
Dataset<TypedRow> software = readPathEntity(spark, inputGraphPath + "/software", Software.class);
Dataset<TypedRow> datasource = readPathEntity(spark, inputGraphPath + "/datasource", Datasource.class);
Dataset<TypedRow> organization = readPathEntity(spark, inputGraphPath + "/organization", Organization.class);
Dataset<TypedRow> project = readPathEntity(spark, inputGraphPath + "/project", Project.class);
return publication
.union(dataset)
.union(other)
.union(software)
.union(datasource)
.union(organization)
.union(project)
.map((MapFunction<TypedRow, Tuple2<String, TypedRow>>)
value -> new Tuple2<>(value.getId(), value),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(TypedRow.class)))
.repartition(numPartitions);
}
private static Dataset<Tuple2<String, EntityRelEntity>> readRelatedEntities(SparkSession spark, String inputRelatedEntitiesPath) {
log.info("Reading related entities from: {}", inputRelatedEntitiesPath);
final List<String> paths = HdfsSupport.listFiles(inputRelatedEntitiesPath, spark.sparkContext().hadoopConfiguration());
log.info("Found paths: {}", String.join(",", paths));
return spark.read()
.load(toSeq(paths))
.as(Encoders.bean(EntityRelEntity.class))
.map((MapFunction<EntityRelEntity, Tuple2<String, EntityRelEntity>>)
value -> new Tuple2<>(value.getRelation().getSource(), value),
Encoders.tuple(Encoders.STRING(), Encoders.kryo(EntityRelEntity.class)));
}
private static <E extends OafEntity> Dataset<TypedRow> readPathEntity(SparkSession spark, String inputEntityPath, Class<E> entityClazz) {
log.info("Reading Graph table from: {}", inputEntityPath);
return spark
.read()
.textFile(inputEntityPath)
.map((MapFunction<String, E>) value -> OBJECT_MAPPER.readValue(value, entityClazz), Encoders.bean(entityClazz))
.map((MapFunction<E, TypedRow>) value -> getTypedRow(StringUtils.substringAfterLast(inputEntityPath, "/"), value), Encoders.bean(TypedRow.class));
}
private static TypedRow getTypedRow(String type, OafEntity entity) throws JsonProcessingException {
TypedRow t = new TypedRow();
t.setType(type);
t.setDeleted(entity.getDataInfo().getDeletedbyinference());
t.setId(entity.getId());
t.setOaf(OBJECT_MAPPER.writeValueAsString(entity));
return t;
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
private static Seq<String> toSeq(List<String> list) {
return JavaConverters.asScalaIteratorConverter(list.iterator()).asScala().toSeq();
}
}

View File

@ -1,291 +0,0 @@
package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.DeserializationFeature;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils;
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.oa.provision.model.*;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import scala.Tuple2;
import java.io.IOException;
import java.io.Serializable;
import java.util.Map;
import static eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils.asRelatedEntity;
/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
*
* The operation is implemented creating the union between the entity types (E), joined by the relationships (R), and again
* by E, finally grouped by E.id;
*
* Different manipulations of the E and R sets are introduced to reduce the complexity of the operation
* 1) treat the object payload as string, extracting only the necessary information beforehand using json path,
* it seems that deserializing it with jackson's object mapper has higher memory footprint.
*
* 2) only consider rels that are not virtually deleted ($.dataInfo.deletedbyinference == false)
* 3) we only need a subset of fields from the related entities, so we introduce a distinction between E_source = S
* and E_target = T. Objects in T are heavily pruned by all the unnecessary information
*
* 4) perform the join as (((T.id join R.target) union S) groupby S.id) yield S -> [ <T, R> ]
*/
public class GraphJoiner implements Serializable {
private Map<String, LongAccumulator> accumulators = Maps.newHashMap();
public static final int MAX_RELS = 100;
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
private SparkSession spark;
private ContextMapper contextMapper;
private String inputPath;
private String outPath;
private String otherDsTypeId;
public GraphJoiner(SparkSession spark, ContextMapper contextMapper, String otherDsTypeId, String inputPath, String outPath) {
this.spark = spark;
this.contextMapper = contextMapper;
this.otherDsTypeId = otherDsTypeId;
this.inputPath = inputPath;
this.outPath = outPath;
final SparkContext sc = spark.sparkContext();
prepareAccumulators(sc);
}
public GraphJoiner adjacencyLists() {
final JavaSparkContext jsc = new JavaSparkContext(getSpark().sparkContext());
// read each entity
JavaPairRDD<String, TypedRow> datasource = readPathEntity(jsc, getInputPath(), "datasource");
JavaPairRDD<String, TypedRow> organization = readPathEntity(jsc, getInputPath(), "organization");
JavaPairRDD<String, TypedRow> project = readPathEntity(jsc, getInputPath(), "project");
JavaPairRDD<String, TypedRow> dataset = readPathEntity(jsc, getInputPath(), "dataset");
JavaPairRDD<String, TypedRow> otherresearchproduct = readPathEntity(jsc, getInputPath(), "otherresearchproduct");
JavaPairRDD<String, TypedRow> software = readPathEntity(jsc, getInputPath(), "software");
JavaPairRDD<String, TypedRow> publication = readPathEntity(jsc, getInputPath(), "publication");
// create the union between all the entities
final String entitiesPath = getOutPath() + "/entities";
datasource
.union(organization)
.union(project)
.union(dataset)
.union(otherresearchproduct)
.union(software)
.union(publication)
.map(e -> new EntityRelEntity().setSource(e._2()))
.map(GraphMappingUtils::serialize)
.saveAsTextFile(entitiesPath, GzipCodec.class);
JavaPairRDD<String, EntityRelEntity> entities = jsc.textFile(entitiesPath)
.map(t -> new ObjectMapper().readValue(t, EntityRelEntity.class))
.mapToPair(t -> new Tuple2<>(t.getSource().getSourceId(), t));
final String relationPath = getOutPath() + "/relation";
// reads the relationships
final JavaPairRDD<SortableRelationKey, EntityRelEntity> rels = readPathRelation(jsc, getInputPath())
.filter(rel -> !rel.getDeleted()) //only consider those that are not virtually deleted
.map(p -> new EntityRelEntity().setRelation(p))
.mapToPair(p -> new Tuple2<>(SortableRelationKey.from(p), p));
rels
.groupByKey(new RelationPartitioner(rels.getNumPartitions()))
.map(p -> Iterables.limit(p._2(), MAX_RELS))
.flatMap(p -> p.iterator())
.map(s -> new ObjectMapper().writeValueAsString(s))
.saveAsTextFile(relationPath, GzipCodec.class);
final JavaPairRDD<String, EntityRelEntity> relation = jsc.textFile(relationPath)
.map(s -> new ObjectMapper().readValue(s, EntityRelEntity.class))
.mapToPair(p -> new Tuple2<>(p.getRelation().getTargetId(), p));
final String bySourcePath = getOutPath() + "/join_by_source";
relation
.join(entities
.filter(e -> !e._2().getSource().getDeleted())
.mapToPair(e -> new Tuple2<>(e._1(), asRelatedEntity(e._2()))))
.map(s -> new EntityRelEntity()
.setRelation(s._2()._1().getRelation())
.setTarget(s._2()._2().getSource()))
.map(j -> new ObjectMapper().writeValueAsString(j))
.saveAsTextFile(bySourcePath, GzipCodec.class);
JavaPairRDD<String, EntityRelEntity> bySource = jsc.textFile(bySourcePath)
.map(e -> getObjectMapper().readValue(e, EntityRelEntity.class))
.mapToPair(t -> new Tuple2<>(t.getRelation().getSourceId(), t));
final XmlRecordFactory recordFactory = new XmlRecordFactory(accumulators, contextMapper, false, schemaLocation, otherDsTypeId);
entities
.union(bySource)
.groupByKey() // by source id
.map(l -> toJoinedEntity(l))
.mapToPair(je -> new Tuple2<>(
new Text(je.getEntity().getId()),
new Text(recordFactory.build(je))))
.saveAsHadoopFile(getOutPath() + "/xml", Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
return this;
}
public SparkSession getSpark() {
return spark;
}
public String getInputPath() {
return inputPath;
}
public String getOutPath() {
return outPath;
}
// HELPERS
private OafEntity parseOaf(final String json, final String type, final ObjectMapper mapper) {
try {
switch (GraphMappingUtils.EntityType.valueOf(type)) {
case publication:
return mapper.readValue(json, Publication.class);
case dataset:
return mapper.readValue(json, Dataset.class);
case otherresearchproduct:
return mapper.readValue(json, OtherResearchProduct.class);
case software:
return mapper.readValue(json, Software.class);
case datasource:
return mapper.readValue(json, Datasource.class);
case organization:
return mapper.readValue(json, Organization.class);
case project:
return mapper.readValue(json, Project.class);
default:
throw new IllegalArgumentException("invalid type: " + type);
}
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
private JoinedEntity toJoinedEntity(Tuple2<String, Iterable<EntityRelEntity>> p) {
final ObjectMapper mapper = getObjectMapper();
final JoinedEntity j = new JoinedEntity();
final Links links = new Links();
for(EntityRelEntity rel : p._2()) {
if (rel.hasMainEntity() & j.getEntity() == null) {
j.setType(rel.getSource().getType());
j.setEntity(parseOaf(rel.getSource().getOaf(), rel.getSource().getType(), mapper));
}
if (rel.hasRelatedEntity()) {
try {
links.add(
new eu.dnetlib.dhp.oa.provision.model.Tuple2()
.setRelation(mapper.readValue(rel.getRelation().getOaf(), Relation.class))
.setRelatedEntity(mapper.readValue(rel.getTarget().getOaf(), RelatedEntity.class)));
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
}
j.setLinks(links);
if (j.getEntity() == null) {
throw new IllegalStateException("missing main entity on '" + p._1() + "'");
}
return j;
}
/**
* Reads a set of eu.dnetlib.dhp.schema.oaf.OafEntity objects from a sequence file <className, entity json serialization>,
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow
* @param sc
* @param inputPath
* @param type
* @return the JavaPairRDD<String, TypedRow> indexed by entity identifier
*/
private JavaPairRDD<String, TypedRow> readPathEntity(final JavaSparkContext sc, final String inputPath, final String type) {
return sc.textFile(inputPath + "/" + type)
.mapToPair((PairFunction<String, String, TypedRow>) s -> {
final DocumentContext json = JsonPath.parse(s);
final String id = json.read("$.id");
return new Tuple2<>(id, new TypedRow()
.setSourceId(id)
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
.setType(type)
.setOaf(s));
});
}
/**
* Reads a set of eu.dnetlib.dhp.schema.oaf.Relation objects from a sequence file <className, relation json serialization>,
* extracts necessary information using json path, wraps the oaf object in a eu.dnetlib.dhp.graph.model.TypedRow
* @param sc
* @param inputPath
* @return the JavaRDD<TypedRow> containing all the relationships
*/
private JavaRDD<TypedRow> readPathRelation(final JavaSparkContext sc, final String inputPath) {
return sc.textFile(inputPath + "/relation")
.map(s -> {
final DocumentContext json = JsonPath.parse(s);
return new TypedRow()
.setSourceId(json.read("$.source"))
.setTargetId(json.read("$.target"))
.setDeleted(json.read("$.dataInfo.deletedbyinference"))
.setType("relation")
.setRelType("$.relType")
.setSubRelType("$.subRelType")
.setRelClass("$.relClass")
.setOaf(s);
});
}
private ObjectMapper getObjectMapper() {
return new ObjectMapper().configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false);
}
private void prepareAccumulators(SparkContext sc) {
accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments"));
accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments"));
accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo"));
accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy"));
accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges"));
accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo"));
accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo"));
accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy"));
accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf"));
accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution"));
accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant"));
accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant"));
accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn"));
accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces"));
accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy"));
accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides"));
}
}

View File

@ -0,0 +1,151 @@
package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Iterables;
import com.google.common.collect.Iterators;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import eu.dnetlib.dhp.oa.provision.utils.RelationPartitioner;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.*;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.util.Optional;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
*
* The operation is implemented by sequentially joining one entity type at time (E) with the relationships (R), and again
* by E, finally grouped by E.id;
*
* The workflow is organized in different parts aimed to to reduce the complexity of the operation
* 1) PrepareRelationsJob:
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
* can be linked at most to 100 other objects
*
* 2) JoinRelationEntityByTargetJob:
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
*/
public class PrepareRelationsJob {
private static final Logger log = LoggerFactory.getLogger(PrepareRelationsJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final int MAX_RELS = 100;
public static void main(String[] args) throws Exception {
String jsonConfiguration = IOUtils.toString(
PrepareRelationsJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_prepare_relations.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputRelationsPath = parser.get("inputRelationsPath");
log.info("inputRelationsPath: {}", inputRelationsPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
int numPartitions = Integer.parseInt(parser.get("relPartitions"));
log.info("relPartitions: {}", numPartitions);
SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
prepareRelationsFromPaths(spark, inputRelationsPath, outputPath, numPartitions);
});
}
private static void prepareRelationsFromPaths(SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) {
readPathRelation(spark, inputRelationsPath)
.filter((FilterFunction<SortableRelation>) value -> value.getDataInfo().getDeletedbyinference() == false)
.groupByKey((MapFunction<SortableRelation, String>) value -> value.getSource(), Encoders.STRING())
.flatMapGroups((FlatMapGroupsFunction<String, SortableRelation, SortableRelation>) (key, values) -> Iterators.limit(values, MAX_RELS), Encoders.bean(SortableRelation.class))
.repartition(numPartitions)
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
}
/**
* Reads a Dataset of eu.dnetlib.dhp.oa.provision.model.SortableRelation objects from a newline delimited json text file,
*
* @param spark
* @param inputPath
* @return the Dataset<SortableRelation> containing all the relationships
*/
private static Dataset<SortableRelation> readPathRelation(SparkSession spark, final String inputPath) {
return spark.read()
.textFile(inputPath)
.map((MapFunction<String, SortableRelation>) value -> OBJECT_MAPPER.readValue(value, SortableRelation.class), Encoders.bean(SortableRelation.class));
}
private static void prepareRelationsRDDFromPaths(SparkSession spark, String inputRelationsPath, String outputPath, int numPartitions) {
JavaRDD<SortableRelation> rels = readPathRelationRDD(spark, inputRelationsPath)
.repartition(numPartitions);
RDD<SortableRelation> d = rels
.filter(rel -> !rel.getDataInfo().getDeletedbyinference()) //only consider those that are not virtually deleted
.mapToPair((PairFunction<SortableRelation, SortableRelation, SortableRelation>) rel -> new Tuple2<>(rel, rel))
.groupByKey(new RelationPartitioner(rels.getNumPartitions()))
.map(p -> Iterables.limit(p._2(), MAX_RELS))
.flatMap(p -> p.iterator())
.rdd();
spark.createDataset(d, Encoders.bean(SortableRelation.class))
.write()
.mode(SaveMode.Overwrite)
.parquet(outputPath);
}
private static JavaRDD<SortableRelation> readPathRelationRDD(SparkSession spark, final String inputPath) {
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
return sc.textFile(inputPath)
.map(s -> OBJECT_MAPPER.readValue(s, SortableRelation.class));
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
}

View File

@ -1,47 +0,0 @@
package eu.dnetlib.dhp.oa.provision;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;
public class SparkXmlRecordBuilderJob {
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkXmlRecordBuilderJob.class.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_build_adjacency_lists.json")));
parser.parseArgument(args);
final String master = parser.get("master");
final SparkConf conf = new SparkConf()
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
try(SparkSession spark = getSession(conf, master)) {
final String inputPath = parser.get("sourcePath");
final String outputPath = parser.get("outputPath");
final String isLookupUrl = parser.get("isLookupUrl");
final String otherDsTypeId = parser.get("otherDsTypeId");
final FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
new GraphJoiner(spark, ContextMapper.fromIS(isLookupUrl), otherDsTypeId, inputPath, outputPath)
.adjacencyLists();
}
}
private static SparkSession getSession(SparkConf conf, String master) {
return SparkSession
.builder()
.config(conf)
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
.master(master)
.getOrCreate();
}
}

View File

@ -0,0 +1,166 @@
package eu.dnetlib.dhp.oa.provision;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.provision.model.*;
import eu.dnetlib.dhp.oa.provision.utils.ContextMapper;
import eu.dnetlib.dhp.oa.provision.utils.GraphMappingUtils;
import eu.dnetlib.dhp.oa.provision.utils.XmlRecordFactory;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.SparkConf;
import org.apache.spark.SparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import scala.Tuple2;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
/**
* Joins the graph nodes by resolving the links of distance = 1 to create an adjacency list of linked objects.
* The operation considers all the entity types (publication, dataset, software, ORP, project, datasource, organization,
* and all the possible relationships (similarity links produced by the Dedup process are excluded).
*
* The workflow is organized in different parts aimed to to reduce the complexity of the operation
* 1) PrepareRelationsJob:
* only consider relationships that are not virtually deleted ($.dataInfo.deletedbyinference == false), each entity
* can be linked at most to 100 other objects
*
* 2) JoinRelationEntityByTargetJob:
* (phase 1): prepare tuples [relation - target entity] (R - T):
* for each entity type E_i
* map E_i as RelatedEntity T_i to simplify the model and extracting only the necessary information
* join (R.target = T_i.id)
* save the tuples (R_i, T_i)
* (phase 2):
* create the union of all the entity types E, hash by id
* read the tuples (R, T), hash by R.source
* join E.id = (R, T).source, where E becomes the Source Entity S
* save the tuples (S, R, T)
*
* 3) AdjacencyListBuilderJob:
* given the tuple (S - R - T) we need to group by S.id -> List [ R - T ], mapping the result as JoinedEntity
*
* 4) XmlConverterJob:
* convert the JoinedEntities as XML records
*/
public class XmlConverterJob {
private static final Logger log = LoggerFactory.getLogger(XmlConverterJob.class);
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public static final String schemaLocation = "https://www.openaire.eu/schema/1.0/oaf-1.0.xsd";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
XmlConverterJob.class
.getResourceAsStream("/eu/dnetlib/dhp/oa/provision/input_params_xml_converter.json")));
parser.parseArgument(args);
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
String outputPath = parser.get("outputPath");
log.info("outputPath: {}", outputPath);
String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
String otherDsTypeId = parser.get("otherDsTypeId");
log.info("otherDsTypeId: {}", otherDsTypeId);
SparkConf conf = new SparkConf();
runWithSparkSession(conf, isSparkSessionManaged,
spark -> {
removeOutputDir(spark, outputPath);
convertToXml(spark, inputPath, outputPath, ContextMapper.fromIS(isLookupUrl), otherDsTypeId);
});
}
private static void convertToXml(SparkSession spark, String inputPath, String outputPath, ContextMapper contextMapper, String otherDsTypeId) {
final XmlRecordFactory recordFactory = new XmlRecordFactory(prepareAccumulators(spark.sparkContext()), contextMapper, false, schemaLocation, otherDsTypeId);
spark.read()
.load(inputPath)
.as(Encoders.bean(JoinedEntity.class))
.map((MapFunction<JoinedEntity, JoinedEntity>) j -> {
if (j.getLinks() != null) {
j.setLinks(j.getLinks()
.stream()
.filter(t -> t.getRelation() != null & t.getRelatedEntity() != null)
.collect(Collectors.toCollection(ArrayList::new)));
}
return j;
}, Encoders.bean(JoinedEntity.class))
.map((MapFunction<JoinedEntity, Tuple2<String, String>>) je -> new Tuple2<>(
je.getEntity().getId(),
recordFactory.build(je)
), Encoders.tuple(Encoders.STRING(), Encoders.STRING()))
.javaRDD()
.mapToPair((PairFunction<Tuple2<String, String>, Text, Text>) t -> new Tuple2<>(new Text(t._1()), new Text(t._2())))
.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
}
private static void removeOutputDir(SparkSession spark, String path) {
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
}
private static Map<String, LongAccumulator> prepareAccumulators(SparkContext sc) {
Map<String, LongAccumulator> accumulators = Maps.newHashMap();
accumulators.put("resultResult_similarity_isAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_isAmongTopNSimilarDocuments"));
accumulators.put("resultResult_similarity_hasAmongTopNSimilarDocuments", sc.longAccumulator("resultResult_similarity_hasAmongTopNSimilarDocuments"));
accumulators.put("resultResult_supplement_isSupplementTo", sc.longAccumulator("resultResult_supplement_isSupplementTo"));
accumulators.put("resultResult_supplement_isSupplementedBy", sc.longAccumulator("resultResult_supplement_isSupplementedBy"));
accumulators.put("resultResult_dedup_isMergedIn", sc.longAccumulator("resultResult_dedup_isMergedIn"));
accumulators.put("resultResult_dedup_merges", sc.longAccumulator("resultResult_dedup_merges"));
accumulators.put("resultResult_publicationDataset_isRelatedTo", sc.longAccumulator("resultResult_publicationDataset_isRelatedTo"));
accumulators.put("resultResult_relationship_isRelatedTo", sc.longAccumulator("resultResult_relationship_isRelatedTo"));
accumulators.put("resultProject_outcome_isProducedBy", sc.longAccumulator("resultProject_outcome_isProducedBy"));
accumulators.put("resultProject_outcome_produces", sc.longAccumulator("resultProject_outcome_produces"));
accumulators.put("resultOrganization_affiliation_isAuthorInstitutionOf", sc.longAccumulator("resultOrganization_affiliation_isAuthorInstitutionOf"));
accumulators.put("resultOrganization_affiliation_hasAuthorInstitution", sc.longAccumulator("resultOrganization_affiliation_hasAuthorInstitution"));
accumulators.put("projectOrganization_participation_hasParticipant", sc.longAccumulator("projectOrganization_participation_hasParticipant"));
accumulators.put("projectOrganization_participation_isParticipant", sc.longAccumulator("projectOrganization_participation_isParticipant"));
accumulators.put("organizationOrganization_dedup_isMergedIn", sc.longAccumulator("organizationOrganization_dedup_isMergedIn"));
accumulators.put("organizationOrganization_dedup_merges", sc.longAccumulator("resultProject_outcome_produces"));
accumulators.put("datasourceOrganization_provision_isProvidedBy", sc.longAccumulator("datasourceOrganization_provision_isProvidedBy"));
accumulators.put("datasourceOrganization_provision_provides", sc.longAccumulator("datasourceOrganization_provision_provides"));
return accumulators;
}
}

View File

@ -10,14 +10,13 @@ import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.solr.common.SolrInputDocument;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.rdd.RDD;
import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
@ -28,47 +27,69 @@ import java.io.StringReader;
import java.io.StringWriter;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Optional;
public class SparkXmlIndexingJob {
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
private static final Log log = LogFactory.getLog(SparkXmlIndexingJob.class);
public class XmlIndexingJob {
private static final Logger log = LoggerFactory.getLogger(XmlIndexingJob.class);
private static final Integer DEFAULT_BATCH_SIZE = 1000;
private static final String LAYOUT = "index";
private static final String INTERPRETATION = "openaire";
private static final String SEPARATOR = "-";
public static final String DATE_FORMAT = "yyyy-MM-dd'T'hh:mm:ss'Z'";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils.toString(
SparkXmlIndexingJob.class.getResourceAsStream(
XmlIndexingJob.class.getResourceAsStream(
"/eu/dnetlib/dhp/oa/provision/input_params_update_index.json")));
parser.parseArgument(args);
final String inputPath = parser.get("sourcePath");
Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged"))
.map(Boolean::valueOf)
.orElse(Boolean.TRUE);
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
final String inputPath = parser.get("inputPath");
log.info("inputPath: {}", inputPath);
final String isLookupUrl = parser.get("isLookupUrl");
log.info("isLookupUrl: {}", isLookupUrl);
final String format = parser.get("format");
log.info("format: {}", format);
final Integer batchSize = parser.getObjectMap().containsKey("batchSize") ? Integer.valueOf(parser.get("batchSize")) : DEFAULT_BATCH_SIZE;
log.info("batchSize: {}", batchSize);
final ISLookUpService isLookup = ISLookupClientFactory.getLookUpService(isLookupUrl);
final String fields = getLayoutSource(isLookup, format);
log.info("fields: {}", fields);
final String xslt = getLayoutTransformer(isLookup);
final String dsId = getDsId(format, isLookup);
log.info("dsId: {}", dsId);
final String zkHost = getZkHost(isLookup);
log.info("zkHost: {}", zkHost);
final String version = getRecordDatestamp();
final String indexRecordXslt = getLayoutTransformer(format, fields, xslt);
log.info("indexRecordTransformer {}", indexRecordXslt);
log.info("indexRecordTransformer: " + indexRecordXslt);
final SparkConf conf = new SparkConf();
final String master = parser.get("master");
final SparkConf conf = new SparkConf()
.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
try(SparkSession spark = getSession(conf, master)) {
final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
runWithSparkSession(conf, isSparkSessionManaged,
spark -> {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
RDD<SolrInputDocument> docs = sc.sequenceFile(inputPath, Text.class, Text.class)
.map(t -> t._2().toString())
@ -76,17 +97,9 @@ public class SparkXmlIndexingJob {
.map(s -> new StreamingInputDocumentFactory(version, dsId).parseDocument(s))
.rdd();
SolrSupport.indexDocs(zkHost, format + "-" + LAYOUT + "-openaire", batchSize, docs);
}
}
private static SparkSession getSession(SparkConf conf, String master) {
return SparkSession
.builder()
.config(conf)
.appName(SparkXmlRecordBuilderJob.class.getSimpleName())
.master(master)
.getOrCreate();
final String collection = format + SEPARATOR + LAYOUT + SEPARATOR + INTERPRETATION;
SolrSupport.indexDocs(zkHost, collection, batchSize, docs);
});
}
private static String toIndexRecord(Transformer tr, final String record) {
@ -95,7 +108,7 @@ public class SparkXmlIndexingJob {
tr.transform(new StreamSource(new StringReader(record)), res);
return res.getWriter().toString();
} catch (Throwable e) {
System.out.println("XPathException on record:\n" + record);
log.error("XPathException on record: \n {}", record, e);
throw new IllegalArgumentException(e);
}
}
@ -127,7 +140,7 @@ public class SparkXmlIndexingJob {
* @return the parsed date
*/
public static String getRecordDatestamp() {
return new SimpleDateFormat("yyyy-MM-dd'T'hh:mm:ss'Z'").format(new Date());
return new SimpleDateFormat(DATE_FORMAT).format(new Date());
}
/**

View File

@ -1,54 +1,64 @@
package eu.dnetlib.dhp.oa.provision.model;
import com.google.common.base.Objects;
import java.io.Serializable;
public class EntityRelEntity implements Serializable {
private TypedRow source;
private TypedRow relation;
private TypedRow target;
private TypedRow entity;
private SortableRelation relation;
private RelatedEntity target;
public EntityRelEntity() {
}
public EntityRelEntity(TypedRow source) {
this.source = source;
public EntityRelEntity(SortableRelation relation, RelatedEntity target) {
this(null, relation, target);
}
//helpers
public Boolean hasMainEntity() {
return getSource() != null & getRelation() == null & getTarget() == null;
public EntityRelEntity(TypedRow entity, SortableRelation relation, RelatedEntity target) {
this.entity = entity;
this.relation = relation;
this.target = target;
}
public Boolean hasRelatedEntity() {
return getSource() == null & getRelation() != null & getTarget() != null;
public TypedRow getEntity() {
return entity;
}
public TypedRow getSource() {
return source;
public void setEntity(TypedRow entity) {
this.entity = entity;
}
public EntityRelEntity setSource(TypedRow source) {
this.source = source;
return this;
}
public TypedRow getRelation() {
public SortableRelation getRelation() {
return relation;
}
public EntityRelEntity setRelation(TypedRow relation) {
public void setRelation(SortableRelation relation) {
this.relation = relation;
return this;
}
public TypedRow getTarget() {
public RelatedEntity getTarget() {
return target;
}
public EntityRelEntity setTarget(TypedRow target) {
public void setTarget(RelatedEntity target) {
this.target = target;
return this;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
EntityRelEntity that = (EntityRelEntity) o;
return Objects.equal(entity, that.entity) &&
Objects.equal(relation, that.relation) &&
Objects.equal(target, that.target);
}
@Override
public int hashCode() {
return Objects.hashCode(entity, relation, target);
}
}

View File

@ -1,41 +1,30 @@
package eu.dnetlib.dhp.oa.provision.model;
import eu.dnetlib.dhp.schema.oaf.OafEntity;
import java.io.Serializable;
import java.util.List;
public class JoinedEntity implements Serializable {
private String type;
private TypedRow entity;
private OafEntity entity;
private List<Tuple2> links;
private Links links;
public String getType() {
return type;
public JoinedEntity() {
}
public JoinedEntity setType(String type) {
this.type = type;
return this;
}
public OafEntity getEntity() {
public TypedRow getEntity() {
return entity;
}
public JoinedEntity setEntity(OafEntity entity) {
public void setEntity(TypedRow entity) {
this.entity = entity;
return this;
}
public Links getLinks() {
public List<Tuple2> getLinks() {
return links;
}
public JoinedEntity setLinks(Links links) {
public void setLinks(List<Tuple2> links) {
this.links = links;
return this;
}
}

View File

@ -1,6 +0,0 @@
package eu.dnetlib.dhp.oa.provision.model;
import java.util.ArrayList;
public class Links extends ArrayList<Tuple2> {
}

View File

@ -1,5 +1,6 @@
package eu.dnetlib.dhp.oa.provision.model;
import com.google.common.base.Objects;
import eu.dnetlib.dhp.schema.oaf.Instance;
import eu.dnetlib.dhp.schema.oaf.KeyValue;
import eu.dnetlib.dhp.schema.oaf.Qualifier;
@ -49,207 +50,218 @@ public class RelatedEntity implements Serializable {
return id;
}
public RelatedEntity setId(String id) {
public void setId(String id) {
this.id = id;
return this;
}
public StructuredProperty getTitle() {
return title;
}
public RelatedEntity setTitle(StructuredProperty title) {
this.title = title;
return this;
}
public String getDateofacceptance() {
return dateofacceptance;
}
public RelatedEntity setDateofacceptance(String dateofacceptance) {
this.dateofacceptance = dateofacceptance;
return this;
}
public String getPublisher() {
return publisher;
}
public RelatedEntity setPublisher(String publisher) {
this.publisher = publisher;
return this;
}
public List<StructuredProperty> getPid() {
return pid;
}
public RelatedEntity setPid(List<StructuredProperty> pid) {
this.pid = pid;
return this;
}
public String getCodeRepositoryUrl() {
return codeRepositoryUrl;
}
public RelatedEntity setCodeRepositoryUrl(String codeRepositoryUrl) {
this.codeRepositoryUrl = codeRepositoryUrl;
return this;
}
public Qualifier getResulttype() {
return resulttype;
}
public RelatedEntity setResulttype(Qualifier resulttype) {
this.resulttype = resulttype;
return this;
}
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public RelatedEntity setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
return this;
}
public List<Instance> getInstances() {
return instances;
}
public RelatedEntity setInstances(List<Instance> instances) {
this.instances = instances;
return this;
}
public String getOfficialname() {
return officialname;
}
public RelatedEntity setOfficialname(String officialname) {
this.officialname = officialname;
return this;
}
public String getWebsiteurl() {
return websiteurl;
}
public RelatedEntity setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl;
return this;
}
public Qualifier getDatasourcetype() {
return datasourcetype;
}
public RelatedEntity setDatasourcetype(Qualifier datasourcetype) {
this.datasourcetype = datasourcetype;
return this;
}
public Qualifier getDatasourcetypeui() {
return datasourcetypeui;
}
public RelatedEntity setDatasourcetypeui(Qualifier datasourcetypeui) {
this.datasourcetypeui = datasourcetypeui;
return this;
}
public Qualifier getOpenairecompatibility() {
return openairecompatibility;
}
public RelatedEntity setOpenairecompatibility(Qualifier openairecompatibility) {
this.openairecompatibility = openairecompatibility;
return this;
}
public String getLegalname() {
return legalname;
}
public RelatedEntity setLegalname(String legalname) {
this.legalname = legalname;
return this;
}
public String getLegalshortname() {
return legalshortname;
}
public RelatedEntity setLegalshortname(String legalshortname) {
this.legalshortname = legalshortname;
return this;
}
public Qualifier getCountry() {
return country;
}
public RelatedEntity setCountry(Qualifier country) {
this.country = country;
return this;
}
public String getCode() {
return code;
}
public RelatedEntity setCode(String code) {
this.code = code;
return this;
}
public String getAcronym() {
return acronym;
}
public RelatedEntity setAcronym(String acronym) {
this.acronym = acronym;
return this;
}
public Qualifier getContracttype() {
return contracttype;
}
public RelatedEntity setContracttype(Qualifier contracttype) {
this.contracttype = contracttype;
return this;
}
public List<String> getFundingtree() {
return fundingtree;
}
public RelatedEntity setFundingtree(List<String> fundingtree) {
this.fundingtree = fundingtree;
return this;
}
public String getProjectTitle() {
return projectTitle;
}
public RelatedEntity setProjectTitle(String projectTitle) {
this.projectTitle = projectTitle;
return this;
}
public String getType() {
return type;
}
public RelatedEntity setType(String type) {
public void setType(String type) {
this.type = type;
return this;
}
public StructuredProperty getTitle() {
return title;
}
public void setTitle(StructuredProperty title) {
this.title = title;
}
public String getWebsiteurl() {
return websiteurl;
}
public void setWebsiteurl(String websiteurl) {
this.websiteurl = websiteurl;
}
public String getDateofacceptance() {
return dateofacceptance;
}
public void setDateofacceptance(String dateofacceptance) {
this.dateofacceptance = dateofacceptance;
}
public String getPublisher() {
return publisher;
}
public void setPublisher(String publisher) {
this.publisher = publisher;
}
public List<StructuredProperty> getPid() {
return pid;
}
public void setPid(List<StructuredProperty> pid) {
this.pid = pid;
}
public String getCodeRepositoryUrl() {
return codeRepositoryUrl;
}
public void setCodeRepositoryUrl(String codeRepositoryUrl) {
this.codeRepositoryUrl = codeRepositoryUrl;
}
public Qualifier getResulttype() {
return resulttype;
}
public void setResulttype(Qualifier resulttype) {
this.resulttype = resulttype;
}
public List<KeyValue> getCollectedfrom() {
return collectedfrom;
}
public void setCollectedfrom(List<KeyValue> collectedfrom) {
this.collectedfrom = collectedfrom;
}
public List<Instance> getInstances() {
return instances;
}
public void setInstances(List<Instance> instances) {
this.instances = instances;
}
public String getOfficialname() {
return officialname;
}
public void setOfficialname(String officialname) {
this.officialname = officialname;
}
public Qualifier getDatasourcetype() {
return datasourcetype;
}
public void setDatasourcetype(Qualifier datasourcetype) {
this.datasourcetype = datasourcetype;
}
public Qualifier getDatasourcetypeui() {
return datasourcetypeui;
}
public void setDatasourcetypeui(Qualifier datasourcetypeui) {
this.datasourcetypeui = datasourcetypeui;
}
public Qualifier getOpenairecompatibility() {
return openairecompatibility;
}
public void setOpenairecompatibility(Qualifier openairecompatibility) {
this.openairecompatibility = openairecompatibility;
}
public String getLegalname() {
return legalname;
}
public void setLegalname(String legalname) {
this.legalname = legalname;
}
public String getLegalshortname() {
return legalshortname;
}
public void setLegalshortname(String legalshortname) {
this.legalshortname = legalshortname;
}
public Qualifier getCountry() {
return country;
}
public void setCountry(Qualifier country) {
this.country = country;
}
public String getProjectTitle() {
return projectTitle;
}
public void setProjectTitle(String projectTitle) {
this.projectTitle = projectTitle;
}
public String getCode() {
return code;
}
public void setCode(String code) {
this.code = code;
}
public String getAcronym() {
return acronym;
}
public void setAcronym(String acronym) {
this.acronym = acronym;
}
public Qualifier getContracttype() {
return contracttype;
}
public void setContracttype(Qualifier contracttype) {
this.contracttype = contracttype;
}
public List<String> getFundingtree() {
return fundingtree;
}
public void setFundingtree(List<String> fundingtree) {
this.fundingtree = fundingtree;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
RelatedEntity that = (RelatedEntity) o;
return Objects.equal(id, that.id) &&
Objects.equal(type, that.type) &&
Objects.equal(title, that.title) &&
Objects.equal(websiteurl, that.websiteurl) &&
Objects.equal(dateofacceptance, that.dateofacceptance) &&
Objects.equal(publisher, that.publisher) &&
Objects.equal(pid, that.pid) &&
Objects.equal(codeRepositoryUrl, that.codeRepositoryUrl) &&
Objects.equal(resulttype, that.resulttype) &&
Objects.equal(collectedfrom, that.collectedfrom) &&
Objects.equal(instances, that.instances) &&
Objects.equal(officialname, that.officialname) &&
Objects.equal(datasourcetype, that.datasourcetype) &&
Objects.equal(datasourcetypeui, that.datasourcetypeui) &&
Objects.equal(openairecompatibility, that.openairecompatibility) &&
Objects.equal(legalname, that.legalname) &&
Objects.equal(legalshortname, that.legalshortname) &&
Objects.equal(country, that.country) &&
Objects.equal(projectTitle, that.projectTitle) &&
Objects.equal(code, that.code) &&
Objects.equal(acronym, that.acronym) &&
Objects.equal(contracttype, that.contracttype) &&
Objects.equal(fundingtree, that.fundingtree);
}
@Override
public int hashCode() {
return Objects.hashCode(id, type, title, websiteurl, dateofacceptance, publisher, pid, codeRepositoryUrl, resulttype, collectedfrom, instances, officialname, datasourcetype, datasourcetypeui, openairecompatibility, legalname, legalshortname, country, projectTitle, code, acronym, contracttype, fundingtree);
}
}

View File

@ -0,0 +1,35 @@
package eu.dnetlib.dhp.oa.provision.model;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.Relation;
import java.io.Serializable;
import java.util.Map;
public class SortableRelation extends Relation implements Comparable<Relation>, Serializable {
private final static Map<String, Integer> weights = Maps.newHashMap();
static {
weights.put("outcome", 0);
weights.put("supplement", 1);
weights.put("publicationDataset", 2);
weights.put("relationship", 3);
weights.put("similarity", 4);
weights.put("affiliation", 5);
weights.put("provision", 6);
weights.put("participation", 7);
weights.put("dedup", 8);
}
@Override
public int compareTo(Relation o) {
return ComparisonChain.start()
.compare(weights.get(getSubRelType()), weights.get(o.getSubRelType()))
.compare(getSource(), o.getSource())
.compare(getTarget(), o.getTarget())
.result();
}
}

View File

@ -1,99 +0,0 @@
package eu.dnetlib.dhp.oa.provision.model;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Maps;
import java.io.Serializable;
import java.util.Map;
/**
* Allows to sort relationships according to the priority defined in weights map.
*/
public class SortableRelationKey implements Comparable<SortableRelationKey>, Serializable {
private String sourceId;
private String targetId;
private String relType;
private String subRelType;
private String relClass;
private final static Map<String, Integer> weights = Maps.newHashMap();
static {
weights.put("outcome", 0);
weights.put("supplement", 1);
weights.put("publicationDataset", 2);
weights.put("relationship", 3);
weights.put("similarity", 4);
weights.put("affiliation", 5);
weights.put("provision", 6);
weights.put("participation", 7);
weights.put("dedup", 8);
}
public static SortableRelationKey from(final EntityRelEntity e) {
return new SortableRelationKey()
.setSourceId(e.getRelation().getSourceId())
.setTargetId(e.getRelation().getTargetId())
.setRelType(e.getRelation().getRelType())
.setSubRelType(e.getRelation().getSubRelType())
.setRelClass(e.getRelation().getRelClass());
}
public String getSourceId() {
return sourceId;
}
public SortableRelationKey setSourceId(String sourceId) {
this.sourceId = sourceId;
return this;
}
public String getTargetId() {
return targetId;
}
public SortableRelationKey setTargetId(String targetId) {
this.targetId = targetId;
return this;
}
public String getRelType() {
return relType;
}
public SortableRelationKey setRelType(String relType) {
this.relType = relType;
return this;
}
public String getSubRelType() {
return subRelType;
}
public SortableRelationKey setSubRelType(String subRelType) {
this.subRelType = subRelType;
return this;
}
public String getRelClass() {
return relClass;
}
public SortableRelationKey setRelClass(String relClass) {
this.relClass = relClass;
return this;
}
@Override
public int compareTo(SortableRelationKey o) {
return ComparisonChain.start()
.compare(weights.get(getSubRelType()), weights.get(o.getSubRelType()))
.compare(getSourceId(), o.getSourceId())
.compare(getTargetId(), o.getTargetId())
.result();
}
}

View File

@ -2,27 +2,50 @@ package eu.dnetlib.dhp.oa.provision.model;
import eu.dnetlib.dhp.schema.oaf.Relation;
public class Tuple2 {
import java.io.Serializable;
import java.util.Objects;
public class Tuple2 implements Serializable {
private Relation relation;
private RelatedEntity relatedEntity;
public Tuple2() {
}
public Tuple2(Relation relation, RelatedEntity relatedEntity) {
this.relation = relation;
this.relatedEntity = relatedEntity;
}
public Relation getRelation() {
return relation;
}
public Tuple2 setRelation(Relation relation) {
public void setRelation(Relation relation) {
this.relation = relation;
return this;
}
public RelatedEntity getRelatedEntity() {
return relatedEntity;
}
public Tuple2 setRelatedEntity(RelatedEntity relatedEntity) {
public void setRelatedEntity(RelatedEntity relatedEntity) {
this.relatedEntity = relatedEntity;
return this;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
Tuple2 t2 = (Tuple2) o;
return getRelation().equals(t2.getRelation());
}
@Override
public int hashCode() {
return Objects.hash(getRelation().hashCode());
}
}

View File

@ -1,92 +1,61 @@
package eu.dnetlib.dhp.oa.provision.model;
import com.google.common.base.Objects;
import java.io.Serializable;
public class TypedRow implements Serializable {
private String sourceId;
private String targetId;
private String id;
private Boolean deleted;
private String type;
private String relType;
private String subRelType;
private String relClass;
private String oaf;
public String getSourceId() {
return sourceId;
public String getId() {
return id;
}
public TypedRow setSourceId(String sourceId) {
this.sourceId = sourceId;
return this;
}
public String getTargetId() {
return targetId;
}
public TypedRow setTargetId(String targetId) {
this.targetId = targetId;
return this;
public void setId(String id) {
this.id = id;
}
public Boolean getDeleted() {
return deleted;
}
public TypedRow setDeleted(Boolean deleted) {
public void setDeleted(Boolean deleted) {
this.deleted = deleted;
return this;
}
public String getType() {
return type;
}
public TypedRow setType(String type) {
public void setType(String type) {
this.type = type;
return this;
}
public String getRelType() {
return relType;
}
public TypedRow setRelType(String relType) {
this.relType = relType;
return this;
}
public String getSubRelType() {
return subRelType;
}
public TypedRow setSubRelType(String subRelType) {
this.subRelType = subRelType;
return this;
}
public String getRelClass() {
return relClass;
}
public TypedRow setRelClass(String relClass) {
this.relClass = relClass;
return this;
}
public String getOaf() {
return oaf;
}
public TypedRow setOaf(String oaf) {
public void setOaf(String oaf) {
this.oaf = oaf;
return this;
}
@Override
public boolean equals(Object o) {
if (this == o) return true;
if (o == null || getClass() != o.getClass()) return false;
TypedRow typedRow2 = (TypedRow) o;
return Objects.equal(id, typedRow2.id);
}
@Override
public int hashCode() {
return Objects.hashCode(id);
}
}

View File

@ -1,33 +1,47 @@
package eu.dnetlib.dhp.oa.provision.utils;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Predicate;
import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import eu.dnetlib.dhp.oa.provision.model.EntityRelEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.TypedRow;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import eu.dnetlib.dhp.schema.oaf.*;
import net.minidev.json.JSONArray;
import org.apache.commons.lang3.StringUtils;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.*;
import java.util.stream.Collectors;
import static org.apache.commons.lang3.StringUtils.*;
import static org.apache.commons.lang3.StringUtils.substringAfter;
public class GraphMappingUtils {
public static final String SEPARATOR = "_";
public final static Map<EntityType, Class> entityTypes = Maps.newHashMap();
static {
entityTypes.put(EntityType.datasource, Datasource.class);
entityTypes.put(EntityType.organization, Organization.class);
entityTypes.put(EntityType.project, Project.class);
entityTypes.put(EntityType.dataset, Dataset.class);
entityTypes.put(EntityType.otherresearchproduct, OtherResearchProduct.class);
entityTypes.put(EntityType.software, Software.class);
entityTypes.put(EntityType.publication, Publication.class);
}
public enum EntityType {
publication, dataset, otherresearchproduct, software, datasource, organization, project
publication, dataset, otherresearchproduct, software, datasource, organization, project;
public static <T extends OafEntity> EntityType fromClass(Class<T> clazz) {
switch (clazz.getName()) {
case "eu.dnetlib.dhp.schema.oaf.Publication" : return publication;
case "eu.dnetlib.dhp.schema.oaf.Dataset" : return dataset;
case "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct" : return otherresearchproduct;
case "eu.dnetlib.dhp.schema.oaf.Software" : return software;
case "eu.dnetlib.dhp.schema.oaf.Datasource" : return datasource;
case "eu.dnetlib.dhp.schema.oaf.Organization" : return organization;
case "eu.dnetlib.dhp.schema.oaf.Project" : return project;
default: throw new IllegalArgumentException("Unknown OafEntity class: " + clazz.getName());
}
}
}
public enum MainEntityType {
@ -36,8 +50,6 @@ public class GraphMappingUtils {
public static Set<String> authorPidTypes = Sets.newHashSet("orcid", "magidentifier");
public static Set<String> instanceFieldFilter = Sets.newHashSet("instancetype", "hostedby", "license", "accessright", "collectedfrom", "dateofacceptance", "distributionlocation");
private static final String schemeTemplate = "dnet:%s_%s_relations";
private static Map<EntityType, MainEntityType> entityMapping = Maps.newHashMap();
@ -52,169 +64,127 @@ public class GraphMappingUtils {
entityMapping.put(EntityType.project, MainEntityType.project);
}
public static Class[] getKryoClasses() {
return new Class[]{
Author.class,
Context.class,
Country.class,
DataInfo.class,
eu.dnetlib.dhp.schema.oaf.Dataset.class,
Datasource.class,
ExternalReference.class,
ExtraInfo.class,
Field.class,
GeoLocation.class,
Instance.class,
Journal.class,
KeyValue.class,
Oaf.class,
OafEntity.class,
OAIProvenance.class,
Organization.class,
OriginDescription.class,
OtherResearchProduct.class,
Project.class,
Publication.class,
Qualifier.class,
Relation.class,
SortableRelation.class, //SUPPORT
Result.class,
Software.class,
StructuredProperty.class
};
}
public static String getScheme(final String sourceType, final String targetType) {
return String.format(schemeTemplate,
entityMapping.get(EntityType.valueOf(sourceType)).name(),
entityMapping.get(EntityType.valueOf(targetType)).name());
}
public static String getMainType(final String type) {
return entityMapping.get(EntityType.valueOf(type)).name();
public static String getMainType(final EntityType type) {
return entityMapping.get(type).name();
}
public static boolean isResult(String type) {
public static boolean isResult(EntityType type) {
return MainEntityType.result.name().equals(getMainType(type));
}
public static Predicate<String> instanceFilter = s -> instanceFieldFilter.contains(s);
public static <E extends OafEntity> RelatedEntity asRelatedEntity(E entity, Class<E> clazz) {
public static EntityRelEntity asRelatedEntity(EntityRelEntity e) {
final RelatedEntity re = new RelatedEntity();
re.setId(entity.getId());
re.setType(EntityType.fromClass(clazz).name());
final DocumentContext j = JsonPath.parse(e.getSource().getOaf());
final RelatedEntity re = new RelatedEntity().setId(j.read("$.id")).setType(e.getSource().getType());
re.setPid(entity.getPid());
re.setCollectedfrom(entity.getCollectedfrom());
switch (EntityType.valueOf(e.getSource().getType())) {
switch (GraphMappingUtils.EntityType.fromClass(clazz)) {
case publication:
case dataset:
case otherresearchproduct:
case software:
mapTitle(j, re);
re.setDateofacceptance(j.read("$.dateofacceptance.value"));
re.setPublisher(j.read("$.publisher.value"));
JSONArray pids = j.read("$.pid");
re.setPid(pids.stream()
.map(p -> asStructuredProperty((LinkedHashMap<String, Object>) p))
.collect(Collectors.toList()));
Result result = (Result) entity;
re.setResulttype(asQualifier(j.read("$.resulttype")));
if (result.getTitle() == null && !result.getTitle().isEmpty()) {
re.setTitle(result.getTitle().stream().findFirst().get());
}
JSONArray collfrom = j.read("$.collectedfrom");
re.setCollectedfrom(collfrom.stream()
.map(c -> asKV((LinkedHashMap<String, Object>) c))
.collect(Collectors.toList()));
// will throw exception when the instance is not found
JSONArray instances = j.read("$.instance");
re.setInstances(instances.stream()
.map(i -> {
final LinkedHashMap<String, Object> p = (LinkedHashMap<String, Object>) i;
final Field<String> license = new Field<String>();
license.setValue((String) ((LinkedHashMap<String, Object>) p.get("license")).get("value"));
final Instance instance = new Instance();
instance.setLicense(license);
instance.setAccessright(asQualifier((LinkedHashMap<String, String>) p.get("accessright")));
instance.setInstancetype(asQualifier((LinkedHashMap<String, String>) p.get("instancetype")));
instance.setHostedby(asKV((LinkedHashMap<String, Object>) p.get("hostedby")));
//TODO mapping of distributionlocation
instance.setCollectedfrom(asKV((LinkedHashMap<String, Object>) p.get("collectedfrom")));
Field<String> dateofacceptance = new Field<String>();
dateofacceptance.setValue((String) ((LinkedHashMap<String, Object>) p.get("dateofacceptance")).get("value"));
instance.setDateofacceptance(dateofacceptance);
return instance;
}).collect(Collectors.toList()));
re.setDateofacceptance(getValue(result.getDateofacceptance()));
re.setPublisher(getValue(result.getPublisher()));
re.setResulttype(result.getResulttype());
re.setInstances(result.getInstance());
//TODO still to be mapped
//re.setCodeRepositoryUrl(j.read("$.coderepositoryurl"));
break;
case datasource:
re.setOfficialname(j.read("$.officialname.value"));
re.setWebsiteurl(j.read("$.websiteurl.value"));
re.setDatasourcetype(asQualifier(j.read("$.datasourcetype")));
re.setOpenairecompatibility(asQualifier(j.read("$.openairecompatibility")));
Datasource d = (Datasource) entity;
re.setOfficialname(getValue(d.getOfficialname()));
re.setWebsiteurl(getValue(d.getWebsiteurl()));
re.setDatasourcetype(d.getDatasourcetype());
re.setOpenairecompatibility(d.getOpenairecompatibility());
break;
case organization:
re.setLegalname(j.read("$.legalname.value"));
re.setLegalshortname(j.read("$.legalshortname.value"));
re.setCountry(asQualifier(j.read("$.country")));
re.setWebsiteurl(j.read("$.websiteurl.value"));
Organization o = (Organization) entity;
re.setLegalname(getValue(o.getLegalname()));
re.setLegalshortname(getValue(o.getLegalshortname()));
re.setCountry(o.getCountry());
re.setWebsiteurl(getValue(o.getWebsiteurl()));
break;
case project:
re.setProjectTitle(j.read("$.title.value"));
re.setCode(j.read("$.code.value"));
re.setAcronym(j.read("$.acronym.value"));
re.setContracttype(asQualifier(j.read("$.contracttype")));
Project p = (Project) entity;
JSONArray f = j.read("$.fundingtree");
re.setProjectTitle(getValue(p.getTitle()));
re.setCode(getValue(p.getCode()));
re.setAcronym(getValue(p.getAcronym()));
re.setContracttype(p.getContracttype());
List<Field<String>> f = p.getFundingtree();
if (!f.isEmpty()) {
re.setFundingtree(f.stream()
.map(s -> ((LinkedHashMap<String, String>) s).get("value"))
.map(s -> s.getValue())
.collect(Collectors.toList()));
}
break;
}
return new EntityRelEntity().setSource(
new TypedRow()
.setSourceId(e.getSource().getSourceId())
.setDeleted(e.getSource().getDeleted())
.setType(e.getSource().getType())
.setOaf(serialize(re)));
return re;
}
private static KeyValue asKV(LinkedHashMap<String, Object> j) {
final KeyValue kv = new KeyValue();
kv.setKey((String) j.get("key"));
kv.setValue((String) j.get("value"));
return kv;
private static String getValue(Field<String> field) {
return getFieldValueWithDefault(field, "");
}
private static void mapTitle(DocumentContext j, RelatedEntity re) {
final JSONArray a = j.read("$.title");
if (!a.isEmpty()) {
final StructuredProperty sp = asStructuredProperty((LinkedHashMap<String, Object>) a.get(0));
if (StringUtils.isNotBlank(sp.getValue())) {
re.setTitle(sp);
}
}
}
private static StructuredProperty asStructuredProperty(LinkedHashMap<String, Object> j) {
final StructuredProperty sp = new StructuredProperty();
final String value = (String) j.get("value");
if (StringUtils.isNotBlank(value)) {
sp.setValue((String) j.get("value"));
sp.setQualifier(asQualifier((LinkedHashMap<String, String>) j.get("qualifier")));
}
return sp;
}
public static Qualifier asQualifier(LinkedHashMap<String, String> j) {
final Qualifier q = new Qualifier();
final String classid = j.get("classid");
if (StringUtils.isNotBlank(classid)) {
q.setClassid(classid);
}
final String classname = j.get("classname");
if (StringUtils.isNotBlank(classname)) {
q.setClassname(classname);
}
final String schemeid = j.get("schemeid");
if (StringUtils.isNotBlank(schemeid)) {
q.setSchemeid(schemeid);
}
final String schemename = j.get("schemename");
if (StringUtils.isNotBlank(schemename)) {
q.setSchemename(schemename);
}
return q;
}
public static String serialize(final Object o) {
try {
return new ObjectMapper()
.setSerializationInclusion(JsonInclude.Include.NON_NULL)
.writeValueAsString(o);
} catch (JsonProcessingException e) {
throw new IllegalArgumentException("unable to serialize: " + o.toString(), e);
}
private static <T> T getFieldValueWithDefault(Field<T> f, T defaultValue) {
return Optional.ofNullable(f)
.filter(Objects::nonNull)
.map(x -> x.getValue())
.orElse(defaultValue);
}
public static String removePrefix(final String s) {

View File

@ -1,6 +1,6 @@
package eu.dnetlib.dhp.oa.provision.utils;
import eu.dnetlib.dhp.oa.provision.model.SortableRelationKey;
import eu.dnetlib.dhp.oa.provision.model.SortableRelation;
import org.apache.spark.Partitioner;
import org.apache.spark.util.Utils;
@ -23,7 +23,7 @@ public class RelationPartitioner extends Partitioner {
@Override
public int getPartition(Object key) {
return Utils.nonNegativeMod(((SortableRelationKey) key).getSourceId().hashCode(), numPartitions());
return Utils.nonNegativeMod(((SortableRelation) key).getSource().hashCode(), numPartitions());
}
}

View File

@ -1,5 +1,6 @@
package eu.dnetlib.dhp.oa.provision.utils;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.base.Joiner;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
@ -7,9 +8,7 @@ import com.google.common.collect.Maps;
import com.google.common.collect.Sets;
import com.mycila.xmltool.XMLDoc;
import com.mycila.xmltool.XMLTag;
import eu.dnetlib.dhp.oa.provision.model.JoinedEntity;
import eu.dnetlib.dhp.oa.provision.model.RelatedEntity;
import eu.dnetlib.dhp.oa.provision.model.Tuple2;
import eu.dnetlib.dhp.oa.provision.model.*;
import eu.dnetlib.dhp.schema.oaf.Result;
import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.lang3.StringUtils;
@ -50,6 +49,8 @@ public class XmlRecordFactory implements Serializable {
private boolean indent = false;
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
public XmlRecordFactory(
final ContextMapper contextMapper, final boolean indent,
final String schemaLocation, final String otherDatasourceTypesUForUI) {
@ -74,22 +75,24 @@ public class XmlRecordFactory implements Serializable {
final Set<String> contexts = Sets.newHashSet();
final OafEntity entity = je.getEntity();
final OafEntity entity = toOafEntity(je.getEntity());
TemplateFactory templateFactory = new TemplateFactory();
try {
final List<String> metadata = metadata(je.getType(), entity, contexts);
final EntityType type = GraphMappingUtils.EntityType.valueOf(je.getEntity().getType());
final List<String> metadata = metadata(type, entity, contexts);
// rels has to be processed before the contexts because they enrich the contextMap with the funding info.
final List<String> relations = listRelations(je, templateFactory, contexts);
metadata.addAll(buildContexts(getMainType(je.getType()), contexts));
final String mainType = getMainType(type);
metadata.addAll(buildContexts(mainType, contexts));
metadata.add(XmlSerializationUtils.parseDataInfo(entity.getDataInfo()));
final String body = templateFactory.buildBody(
getMainType(je.getType()),
mainType,
metadata,
relations,
listChildren(je, templateFactory), listExtraInfo(je));
listChildren(entity, je.getEntity().getType(), templateFactory), listExtraInfo(entity));
return printXML(templateFactory.buildRecord(entity, schemaLocation, body), indent);
} catch (final Throwable e) {
@ -97,6 +100,35 @@ public class XmlRecordFactory implements Serializable {
}
}
private static OafEntity toOafEntity(TypedRow typedRow) {
return parseOaf(typedRow.getOaf(), typedRow.getType());
}
private static OafEntity parseOaf(final String json, final String type) {
try {
switch (GraphMappingUtils.EntityType.valueOf(type)) {
case publication:
return OBJECT_MAPPER.readValue(json, Publication.class);
case dataset:
return OBJECT_MAPPER.readValue(json, Dataset.class);
case otherresearchproduct:
return OBJECT_MAPPER.readValue(json, OtherResearchProduct.class);
case software:
return OBJECT_MAPPER.readValue(json, Software.class);
case datasource:
return OBJECT_MAPPER.readValue(json, Datasource.class);
case organization:
return OBJECT_MAPPER.readValue(json, Organization.class);
case project:
return OBJECT_MAPPER.readValue(json, Project.class);
default:
throw new IllegalArgumentException("invalid type: " + type);
}
} catch (IOException e) {
throw new IllegalArgumentException(e);
}
}
private String printXML(String xml, boolean indent) {
try {
final Document doc = new SAXReader().read(new StringReader(xml));
@ -112,7 +144,7 @@ public class XmlRecordFactory implements Serializable {
}
}
private List<String> metadata(final String type, final OafEntity entity, final Set<String> contexts) {
private List<String> metadata(final EntityType type, final OafEntity entity, final Set<String> contexts) {
final List<String> metadata = Lists.newArrayList();
@ -260,11 +292,9 @@ public class XmlRecordFactory implements Serializable {
if (r.getResourcetype() != null) {
metadata.add(XmlSerializationUtils.mapQualifier("resourcetype", r.getResourcetype()));
}
metadata.add(XmlSerializationUtils.mapQualifier("bestaccessright", getBestAccessright(r)));
}
switch (EntityType.valueOf(type)) {
switch (type) {
case publication:
final Publication pub = (Publication) entity;
@ -662,7 +692,7 @@ public class XmlRecordFactory implements Serializable {
if (isNotBlank(re.getCodeRepositoryUrl())) {
metadata.add(XmlSerializationUtils.asXmlElement("coderepositoryurl", re.getCodeRepositoryUrl()));
}
if (re.getResulttype() != null & !re.getResulttype().isBlank()) {
if (re.getResulttype() != null & re.getResulttype().isBlank()) {
metadata.add(XmlSerializationUtils.mapQualifier("resulttype", re.getResulttype()));
}
if (re.getCollectedfrom() != null) {
@ -748,14 +778,14 @@ public class XmlRecordFactory implements Serializable {
return rels;
}
private List<String> listChildren(final JoinedEntity je, TemplateFactory templateFactory) {
private List<String> listChildren(final OafEntity entity, String type, TemplateFactory templateFactory) {
final List<String> children = Lists.newArrayList();
if (MainEntityType.result.toString().equals(getMainType(je.getType()))) {
final List<Instance> instances = ((Result) je.getEntity()).getInstance();
EntityType entityType = EntityType.valueOf(type);
if (MainEntityType.result.toString().equals(getMainType(entityType))) {
final List<Instance> instances = ((Result) entity).getInstance();
if (instances != null) {
for (final Instance instance : ((Result) je.getEntity()).getInstance()) {
for (final Instance instance : ((Result) entity).getInstance()) {
final List<String> fields = Lists.newArrayList();
@ -790,9 +820,9 @@ public class XmlRecordFactory implements Serializable {
children.add(templateFactory.getInstance(instance.getHostedby().getKey(), fields, instance.getUrl()));
}
}
final List<ExternalReference> ext = ((Result) je.getEntity()).getExternalReference();
final List<ExternalReference> ext = ((Result) entity).getExternalReference();
if (ext != null) {
for (final ExternalReference er : ((Result) je.getEntity()).getExternalReference()) {
for (final ExternalReference er : ((Result) entity).getExternalReference()) {
final List<String> fields = Lists.newArrayList();
@ -826,8 +856,8 @@ public class XmlRecordFactory implements Serializable {
return children;
}
private List<String> listExtraInfo(JoinedEntity je) {
final List<ExtraInfo> extraInfo = je.getEntity().getExtraInfo();
private List<String> listExtraInfo(OafEntity entity) {
final List<ExtraInfo> extraInfo = entity.getExtraInfo();
return extraInfo != null ? extraInfo
.stream()
.map(e -> XmlSerializationUtils.mapExtraInfo(e))

View File

@ -1,7 +1,14 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"o", "paramLongName":"outputPath", "paramDescription": "the path used to store temporary output files", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read", "paramRequired": true},
{"paramName":"t", "paramLongName":"otherDsTypeId", "paramDescription": "list of datasource types to populate field datasourcetypeui", "paramRequired": true}
{
"paramName": "in",
"paramLongName": "inputPath",
"paramDescription": "the path of the sequence file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "irp",
"paramLongName": "inputRelationsPath",
"paramDescription": "path to input relations prepare",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "root output location for prepared relations",
"paramRequired": true
},
{
"paramName": "rp",
"paramLongName": "relPartitions",
"paramDescription": "number or partitions for the relations Dataset",
"paramRequired": true
}
]

View File

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "irp",
"paramLongName": "inputRelationsPath",
"paramDescription": "path to input relations from the graph",
"paramRequired": true
},
{
"paramName": "iep",
"paramLongName": "inputEntityPath",
"paramDescription": "path to input entity from the graph",
"paramRequired": true
},
{
"paramName": "clazz",
"paramLongName": "graphTableClassName",
"paramDescription": "class name associated to the input entity path",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "root output location for prepared relations",
"paramRequired": true
}
]

View File

@ -0,0 +1,32 @@
[
{
"paramName": "issm",
"paramLongName": "isSparkSessionManaged",
"paramDescription": "when true will stop SparkSession after job execution",
"paramRequired": false
},
{
"paramName": "irp",
"paramLongName": "inputRelatedEntitiesPath",
"paramDescription": "path to input relations from the graph",
"paramRequired": true
},
{
"paramName": "iep",
"paramLongName": "inputGraphRootPath",
"paramDescription": "root graph path",
"paramRequired": true
},
{
"paramName": "op",
"paramLongName": "outputPath",
"paramDescription": "root output location for prepared relations",
"paramRequired": true
},
{
"paramName": "np",
"paramLongName": "numPartitions",
"paramDescription": "number of partitions to use for the output",
"paramRequired": true
}
]

View File

@ -1,7 +1,26 @@
[
{"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
{"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequence file to read the XML records", "paramRequired": true},
{"paramName":"f", "paramLongName":"format", "paramDescription": "MDFormat name found in the IS profile", "paramRequired": true},
{"paramName":"b", "paramLongName":"batchSize", "paramDescription": "size of the batch of documents sent to solr", "paramRequired": false}
{
"paramName": "is",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "i",
"paramLongName": "inputPath",
"paramDescription": "the path of the sequence file to read the XML records",
"paramRequired": true
},
{
"paramName": "f",
"paramLongName": "format",
"paramDescription": "MDFormat name found in the IS profile",
"paramRequired": true
},
{
"paramName": "b",
"paramLongName": "batchSize",
"paramDescription": "size of the batch of documents sent to solr",
"paramRequired": false
}
]

View File

@ -0,0 +1,26 @@
[
{
"paramName": "in",
"paramLongName": "inputPath",
"paramDescription": "the path of the sequence file to read",
"paramRequired": true
},
{
"paramName": "out",
"paramLongName": "outputPath",
"paramDescription": "the path used to store temporary output files",
"paramRequired": true
},
{
"paramName": "ilu",
"paramLongName": "isLookupUrl",
"paramDescription": "URL of the isLookUp Service",
"paramRequired": true
},
{
"paramName": "odt",
"paramLongName": "otherDsTypeId",
"paramDescription": "list of datasource types to populate field datasourcetypeui",
"paramRequired": true
}
]

View File

@ -19,16 +19,24 @@
<name>hive_metastore_uris</name>
<value>thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083</value>
</property>
<property>
<name>hive_db_name</name>
<value>openaire</value>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18088</value>
<value>http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089</value>
</property>
<property>
<name>spark2EventLogDir</name>
<value>/user/spark/applicationHistory</value>
<value>/user/spark/spark2ApplicationHistory</value>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorAppListener"</value>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>"com.cloudera.spark.lineage.NavigatorQueryListener"</value>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<value>spark2</value>
</property>
</configuration>

View File

@ -2,21 +2,52 @@
<parameters>
<property>
<name>hive_db_name</name>
<description>the target hive database name</description>
<name>inputGraphRootPath</name>
<description>root location of input materialized graph</description>
</property>
<property>
<name>sparkDriverMemory</name>
<name>isLookupUrl</name>
<description>URL for the isLookup service</description>
</property>
<property>
<name>sparkDriverMemoryForJoining</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemory</name>
<name>sparkExecutorMemoryForJoining</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCores</name>
<name>sparkExecutorCoresForJoining</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>sparkDriverMemoryForIndexing</name>
<description>memory for driver process</description>
</property>
<property>
<name>sparkExecutorMemoryForIndexing</name>
<description>memory for individual executor</description>
</property>
<property>
<name>sparkExecutorCoresForIndexing</name>
<description>number of cores used by single executor</description>
</property>
<property>
<name>oozieActionShareLibForSpark2</name>
<description>oozie action sharelib for spark 2.*</description>
</property>
<property>
<name>spark2ExtraListeners</name>
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
<description>spark 2.* extra listeners classname</description>
</property>
<property>
<name>spark2SqlQueryExecutionListeners</name>
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
<description>spark 2.* sql query execution listeners classname</description>
</property>
<property>
<name>spark2YarnHistoryServerAddress</name>
<description>spark 2.* yarn history server address</description>
@ -32,12 +63,8 @@
<name-node>${nameNode}</name-node>
<configuration>
<property>
<name>mapreduce.job.queuename</name>
<value>${queueName}</value>
</property>
<property>
<name>oozie.launcher.mapred.job.queue.name</name>
<value>${oozieLauncherQueueName}</value>
<name>oozie.action.sharelib.for.spark</name>
<value>${oozieActionShareLibForSpark2}</value>
</property>
</configuration>
</global>
@ -46,9 +73,9 @@
<decision name="reuse_records">
<switch>
<case to="adjancency_lists">${wf:conf('reuseRecords') eq false}</case>
<case to="prepare_relations">${wf:conf('reuseRecords') eq false}</case>
<case to="to_solr_index">${wf:conf('reuseRecords') eq true}</case>
<default to="adjancency_lists"/>
<default to="prepare_relations"/>
</switch>
</decision>
@ -56,31 +83,306 @@
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill>
<action name="adjancency_lists">
<action name="prepare_relations">
<spark xmlns="uri:oozie:spark-action:0.2">
<prepare>
<delete path="${outputPath}"/>
<mkdir path="${outputPath}"/>
</prepare>
<master>yarn</master>
<mode>cluster</mode>
<name>build_adjacency_lists</name>
<class>eu.dnetlib.dhp.oa.provision.SparkXmlRecordBuilderJob</class>
<name>PrepareRelations</name>
<class>eu.dnetlib.dhp.oa.provision.PrepareRelationsJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores ${sparkExecutorCoresForJoining}
--executor-memory ${sparkExecutorMemoryForJoining}
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>-mt</arg> <arg>yarn</arg>
<arg>-is</arg> <arg>${isLookupUrl}</arg>
<arg>-t</arg> <arg>${otherDsTypeId}</arg>
<arg>--sourcePath</arg><arg>${sourcePath}</arg>
<arg>--outputPath</arg><arg>${outputPath}</arg>
<arg>--inputRelationsPath</arg><arg>${inputGraphRootPath}/relation</arg>
<arg>--outputPath</arg><arg>${workingDir}/relation</arg>
<arg>--relPartitions</arg><arg>3000</arg>
</spark>
<ok to="fork_join_related_entities"/>
<error to="Kill"/>
</action>
<fork name="fork_join_related_entities">
<path start="join_relation_publication"/>
<path start="join_relation_dataset"/>
<path start="join_relation_otherresearchproduct"/>
<path start="join_relation_software"/>
<path start="join_relation_datasource"/>
<path start="join_relation_organization"/>
<path start="join_relation_project"/>
</fork>
<action name="join_relation_publication">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[relation.target = publication.id]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/publication</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_partial</arg>
</spark>
<ok to="wait_joins"/>
<error to="Kill"/>
</action>
<action name="join_relation_dataset">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[relation.target = dataset.id]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/dataset</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_partial</arg>
</spark>
<ok to="wait_joins"/>
<error to="Kill"/>
</action>
<action name="join_relation_otherresearchproduct">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[relation.target = otherresearchproduct.id]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/otherresearchproduct</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_partial</arg>
</spark>
<ok to="wait_joins"/>
<error to="Kill"/>
</action>
<action name="join_relation_software">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[relation.target = software.id]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/software</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_partial</arg>
</spark>
<ok to="wait_joins"/>
<error to="Kill"/>
</action>
<action name="join_relation_datasource">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[relation.target = datasource.id]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/datasource</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_partial</arg>
</spark>
<ok to="wait_joins"/>
<error to="Kill"/>
</action>
<action name="join_relation_organization">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[relation.target = organization.id]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/organization</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_partial</arg>
</spark>
<ok to="wait_joins"/>
<error to="Kill"/>
</action>
<action name="join_relation_project">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[relation.target = project.id]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase1</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputRelationsPath</arg><arg>${workingDir}/relation</arg>
<arg>--inputEntityPath</arg><arg>${inputGraphRootPath}/project</arg>
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_partial</arg>
</spark>
<ok to="wait_joins"/>
<error to="Kill"/>
</action>
<join name="wait_joins" to="join_all_entities"/>
<action name="join_all_entities">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>Join[entities.id = relatedEntity.source]</name>
<class>eu.dnetlib.dhp.oa.provision.CreateRelatedEntitiesJob_phase2</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputGraphRootPath</arg><arg>${inputGraphRootPath}</arg>
<arg>--inputRelatedEntitiesPath</arg><arg>${workingDir}/join_partial</arg>
<arg>--outputPath</arg><arg>${workingDir}/join_entities</arg>
<arg>--numPartitions</arg><arg>12000</arg>
</spark>
<ok to="adjancency_lists"/>
<error to="Kill"/>
</action>
<action name="adjancency_lists">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>build_adjacency_lists</name>
<class>eu.dnetlib.dhp.oa.provision.AdjacencyListBuilderJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=7680
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg> <arg>${workingDir}/join_entities</arg>
<arg>--outputPath</arg><arg>${workingDir}/joined</arg>
</spark>
<ok to="convert_to_xml"/>
<error to="Kill"/>
</action>
<action name="convert_to_xml">
<spark xmlns="uri:oozie:spark-action:0.2">
<master>yarn</master>
<mode>cluster</mode>
<name>convert_to_xml</name>
<class>eu.dnetlib.dhp.oa.provision.XmlConverterJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores=${sparkExecutorCoresForJoining}
--executor-memory=${sparkExecutorMemoryForJoining}
--driver-memory=${sparkDriverMemoryForJoining}
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
--conf spark.sql.shuffle.partitions=3840
--conf spark.network.timeout=${sparkNetworkTimeout}
</spark-opts>
<arg>--inputPath</arg><arg>${workingDir}/joined</arg>
<arg>--outputPath</arg><arg>${workingDir}/xml</arg>
<arg>--isLookupUrl</arg><arg>${isLookupUrl}</arg>
<arg>--otherDsTypeId</arg><arg>${otherDsTypeId}</arg>
</spark>
<ok to="to_solr_index"/>
<error to="Kill"/>
@ -91,21 +393,20 @@
<master>yarn</master>
<mode>cluster</mode>
<name>to_solr_index</name>
<class>eu.dnetlib.dhp.oa.provision.SparkXmlIndexingJob</class>
<class>eu.dnetlib.dhp.oa.provision.XmlIndexingJob</class>
<jar>dhp-graph-provision-${projectVersion}.jar</jar>
<spark-opts>
--executor-cores ${sparkExecutorCoresForIndexing}
--executor-memory ${sparkExecutorMemoryForIndexing}
--executor-memory=${sparkExecutorMemoryForIndexing}
--driver-memory=${sparkDriverMemoryForIndexing}
--conf spark.dynamicAllocation.enabled=true
--conf spark.dynamicAllocation.maxExecutors=${sparkExecutorCoresForIndexing}
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
--conf spark.extraListeners=${spark2ExtraListeners}
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
</spark-opts>
<arg>-mt</arg> <arg>yarn</arg>
<arg>-is</arg> <arg>${isLookupUrl}</arg>
<arg>--sourcePath</arg><arg>${outputPath}/xml</arg>
<arg>--inputPath</arg><arg>${workingDir}/xml</arg>
<arg>--isLookupUrl</arg> <arg>${isLookupUrl}</arg>
<arg>--format</arg><arg>${format}</arg>
<arg>--batchSize</arg><arg>${batchSize}</arg>
</spark>