forked from D-Net/dnet-hadoop
raw graph creation workflow moved under dhp-graph-mapper, claims integration is included
This commit is contained in:
parent
47f3d9b757
commit
6b5f9ca9cb
|
@ -100,23 +100,11 @@
|
|||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
</dependency>
|
||||
|
||||
|
||||
<dependency>
|
||||
<groupId>org.apache.hadoop</groupId>
|
||||
<artifactId>hadoop-distcp</artifactId>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.postgresql</groupId>
|
||||
<artifactId>postgresql</artifactId>
|
||||
<version>42.2.10</version>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
</project>
|
||||
|
|
|
@ -1,71 +0,0 @@
|
|||
package eu.dnetlib.dhp.migration.step3;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
|
||||
public class DispatchEntitiesApplication {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DispatchEntitiesApplication.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/migration/dispatch_entities_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
|
||||
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String targetPath = parser.get("graphRawPath");
|
||||
|
||||
processEntity(sc, Publication.class, sourcePath, targetPath);
|
||||
processEntity(sc, Dataset.class, sourcePath, targetPath);
|
||||
processEntity(sc, Software.class, sourcePath, targetPath);
|
||||
processEntity(sc, OtherResearchProduct.class, sourcePath, targetPath);
|
||||
processEntity(sc, Datasource.class, sourcePath, targetPath);
|
||||
processEntity(sc, Organization.class, sourcePath, targetPath);
|
||||
processEntity(sc, Project.class, sourcePath, targetPath);
|
||||
processEntity(sc, Relation.class, sourcePath, targetPath);
|
||||
}
|
||||
}
|
||||
|
||||
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(DispatchEntitiesApplication.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
private static void processEntity(final JavaSparkContext sc, final Class<?> clazz, final String sourcePath, final String targetPath) {
|
||||
final String type = clazz.getSimpleName().toLowerCase();
|
||||
|
||||
log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath));
|
||||
|
||||
sc.textFile(sourcePath)
|
||||
.filter(l -> isEntityType(l, type))
|
||||
.map(l -> StringUtils.substringAfter(l, "|"))
|
||||
.saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ???
|
||||
}
|
||||
|
||||
private static boolean isEntityType(final String line, final String type) {
|
||||
return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type);
|
||||
}
|
||||
|
||||
}
|
|
@ -1,197 +0,0 @@
|
|||
<workflow-app name="import regular entities as Graph (all steps)" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>workingPath</name>
|
||||
<value>/tmp/dhp_migration</value>
|
||||
<description>the base path to store temporary intermediate data</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>graphBasePath</name>
|
||||
<description>the target path to store raw graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>reuseContent</name>
|
||||
<value>false</value>
|
||||
<description>should import content from the aggregator or reuse a previous version</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoURL</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="ReuseContent"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="ReuseContent">
|
||||
<switch>
|
||||
<case to="ResetWorkingPath">${wf:conf('reuseContent') eq false}</case>
|
||||
<case to="ResetAllEntitiesPath">${wf:conf('reuseContent') eq true}</case>
|
||||
<default to="ResetWorkingPath"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<action name="ResetWorkingPath">
|
||||
<fs>
|
||||
<delete path="${workingPath}"/>
|
||||
<mkdir path="${workingPath}"/>
|
||||
</fs>
|
||||
<ok to="ImportDB"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDB">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/db_records</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportODF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODF">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/odf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ImportOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAF">
|
||||
<java>
|
||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingPath}/oaf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ResetAllEntitiesPath"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetAllEntitiesPath">
|
||||
<fs>
|
||||
<delete path="${workingPath}/all_entities"/>
|
||||
</fs>
|
||||
<ok to="GenerateEntities"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEntities</name>
|
||||
<class>eu.dnetlib.dhp.migration.step2.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${workingPath}/db_records,${workingPath}/oaf_records,${workingPath}/odf_records</arg>
|
||||
<arg>-t</arg><arg>${workingPath}/all_entities</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="ResetGraphPath"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ResetGraphPath">
|
||||
<fs>
|
||||
<delete path="${graphBasePath}/graph_raw"/>
|
||||
<mkdir path="${graphBasePath}/graph_raw"/>
|
||||
</fs>
|
||||
<ok to="GenerateGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateGraph</name>
|
||||
<class>eu.dnetlib.dhp.migration.step3.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-aggregation-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener"
|
||||
--conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"
|
||||
--conf spark.sql.warehouse.dir="/user/hive/warehouse"
|
||||
</spark-opts>
|
||||
<arg>-mt</arg> <arg>yarn-cluster</arg>
|
||||
<arg>-s</arg><arg>${workingPath}/all_entities</arg>
|
||||
<arg>-g</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
</spark>
|
||||
<ok to="End"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -40,6 +40,7 @@
|
|||
<artifactId>dhp-schemas</artifactId>
|
||||
<version>${project.version}</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>com.jayway.jsonpath</groupId>
|
||||
<artifactId>json-path</artifactId>
|
||||
|
@ -48,6 +49,18 @@
|
|||
<groupId>org.mongodb</groupId>
|
||||
<artifactId>mongo-java-driver</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>dom4j</groupId>
|
||||
<artifactId>dom4j</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>jaxen</groupId>
|
||||
<artifactId>jaxen</artifactId>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.postgresql</groupId>
|
||||
<artifactId>postgresql</artifactId>
|
||||
</dependency>
|
||||
|
||||
</dependencies>
|
||||
|
||||
|
|
|
@ -1,45 +1,15 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.keyValue;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.oaiIProvenance;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.DocumentFactory;
|
||||
import org.dom4j.DocumentHelper;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import java.util.*;
|
||||
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
|
||||
|
||||
public abstract class AbstractMdRecordToOafMapper {
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
public class DispatchEntitiesApplication {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(DispatchEntitiesApplication.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/dispatch_entities_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String sourcePath = parser.get("sourcePath");
|
||||
final String targetPath = parser.get("graphRawPath");
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
removeOutputDir(spark, targetPath);
|
||||
|
||||
processEntity(spark, Publication.class, sourcePath, targetPath);
|
||||
processEntity(spark, Dataset.class, sourcePath, targetPath);
|
||||
processEntity(spark, Software.class, sourcePath, targetPath);
|
||||
processEntity(spark, OtherResearchProduct.class, sourcePath, targetPath);
|
||||
processEntity(spark, Datasource.class, sourcePath, targetPath);
|
||||
processEntity(spark, Organization.class, sourcePath, targetPath);
|
||||
processEntity(spark, Project.class, sourcePath, targetPath);
|
||||
processEntity(spark, Relation.class, sourcePath, targetPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static <T extends Oaf> void processEntity(final SparkSession spark, final Class<T> clazz, final String sourcePath, final String targetPath) {
|
||||
final String type = clazz.getSimpleName().toLowerCase();
|
||||
|
||||
log.info(String.format("Processing entities (%s) in file: %s", type, sourcePath));
|
||||
|
||||
spark.read()
|
||||
.textFile(sourcePath)
|
||||
.filter((FilterFunction<String>) value -> isEntityType(value, type))
|
||||
.map((MapFunction<String, String>) value -> StringUtils.substringAfter(value, "|"), Encoders.STRING())
|
||||
.map((MapFunction<String, T>) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz))
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.parquet(targetPath + "/" + type);
|
||||
|
||||
/*
|
||||
JavaSparkContext.fromSparkContext(spark.sparkContext())
|
||||
.textFile(sourcePath)
|
||||
.filter(l -> isEntityType(l, type))
|
||||
.map(l -> StringUtils.substringAfter(l, "|"))
|
||||
.saveAsTextFile(targetPath + "/" + type, GzipCodec.class); // use repartition(XXX) ???
|
||||
|
||||
*/
|
||||
}
|
||||
|
||||
private static boolean isEntityType(final String line, final String type) {
|
||||
return StringUtils.substringBefore(line, "|").equalsIgnoreCase(type);
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
}
|
|
@ -1,13 +1,10 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.DbClient;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
|
@ -16,37 +13,38 @@ import org.apache.hadoop.fs.FileSystem;
|
|||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaRDD;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.DbClient;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import scala.Tuple2;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.sql.SQLException;
|
||||
import java.util.*;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
|
||||
public class GenerateEntitiesApplication {
|
||||
|
||||
private static final Log log = LogFactory.getLog(GenerateEntitiesApplication.class);
|
||||
private static final Logger log = LoggerFactory.getLogger(GenerateEntitiesApplication.class);
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/migration/generate_entities_parameters.json")));
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/generate_entities_parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String sourcePaths = parser.get("sourcePaths");
|
||||
final String targetPath = parser.get("targetPath");
|
||||
|
||||
|
@ -56,31 +54,27 @@ public class GenerateEntitiesApplication {
|
|||
|
||||
final Map<String, String> code2name = loadClassNames(dbUrl, dbUser, dbPassword);
|
||||
|
||||
try (final SparkSession spark = newSparkSession(parser); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext())) {
|
||||
final List<String> existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
|
||||
generateEntities(sc, code2name, existingSourcePaths, targetPath);
|
||||
}
|
||||
SparkConf conf = new SparkConf();
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
removeOutputDir(spark, targetPath);
|
||||
generateEntities(spark, code2name, sourcePaths, targetPath);
|
||||
});
|
||||
}
|
||||
|
||||
private static SparkSession newSparkSession(final ArgumentApplicationParser parser) {
|
||||
return SparkSession
|
||||
.builder()
|
||||
.appName(GenerateEntitiesApplication.class.getSimpleName())
|
||||
.master(parser.get("master"))
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
private static void generateEntities(final JavaSparkContext sc,
|
||||
private static void generateEntities(final SparkSession spark,
|
||||
final Map<String, String> code2name,
|
||||
final List<String> sourcePaths,
|
||||
final String sourcePaths,
|
||||
final String targetPath) {
|
||||
|
||||
JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
final List<String> existingSourcePaths = Arrays.stream(sourcePaths.split(",")).filter(p -> exists(sc, p)).collect(Collectors.toList());
|
||||
|
||||
log.info("Generate entities from files:");
|
||||
sourcePaths.forEach(log::info);
|
||||
existingSourcePaths.forEach(log::info);
|
||||
|
||||
JavaRDD<String> inputRdd = sc.emptyRDD();
|
||||
|
||||
for (final String sp : sourcePaths) {
|
||||
for (final String sp : existingSourcePaths) {
|
||||
inputRdd = inputRdd.union(sc.sequenceFile(sp, Text.class, Text.class)
|
||||
.map(k -> new Tuple2<>(k._1().toString(), k._2().toString()))
|
||||
.map(k -> convertToListOaf(k._1(), k._2(), code2name))
|
||||
|
@ -88,7 +82,8 @@ public class GenerateEntitiesApplication {
|
|||
.map(oaf -> oaf.getClass().getSimpleName().toLowerCase() + "|" + convertToJson(oaf)));
|
||||
}
|
||||
|
||||
inputRdd.saveAsTextFile(targetPath, GzipCodec.class);
|
||||
inputRdd
|
||||
.saveAsTextFile(targetPath, GzipCodec.class);
|
||||
|
||||
}
|
||||
|
||||
|
@ -163,11 +158,15 @@ public class GenerateEntitiesApplication {
|
|||
|
||||
private static boolean exists(final JavaSparkContext context, final String pathToFile) {
|
||||
try {
|
||||
final FileSystem hdfs = org.apache.hadoop.fs.FileSystem.get(context.hadoopConfiguration());
|
||||
final FileSystem hdfs = FileSystem.get(context.hadoopConfiguration());
|
||||
final Path path = new Path(pathToFile);
|
||||
return hdfs.exists(path);
|
||||
} catch (final IOException e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
}
|
|
@ -0,0 +1,151 @@
|
|||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.HdfsSupport;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.io.compress.GzipCodec;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.FilterFunction;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import scala.Tuple2;
|
||||
import scala.reflect.ClassTag;
|
||||
import scala.reflect.ClassTag$;
|
||||
|
||||
import java.util.Objects;
|
||||
import java.util.Optional;
|
||||
import java.util.function.Function;
|
||||
|
||||
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
|
||||
import static eu.dnetlib.dhp.schema.common.ModelSupport.isSubClass;
|
||||
|
||||
public class MergeClaimsApplication {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(MergeClaimsApplication.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class
|
||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/merge_claims_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
Boolean isSparkSessionManaged = Optional
|
||||
.ofNullable(parser.get("isSparkSessionManaged"))
|
||||
.map(Boolean::valueOf)
|
||||
.orElse(Boolean.TRUE);
|
||||
log.info("isSparkSessionManaged: {}", isSparkSessionManaged);
|
||||
|
||||
final String rawGraphPath = parser.get("rawGraphPath");
|
||||
log.info("rawGraphPath: {}", rawGraphPath);
|
||||
|
||||
final String claimsGraphPath = parser.get("claimsGraphPath");
|
||||
log.info("claimsGraphPath: {}", claimsGraphPath);
|
||||
|
||||
final String outputRawGaphPath = parser.get("outputRawGaphPath");
|
||||
log.info("outputRawGaphPath: {}", outputRawGaphPath);
|
||||
|
||||
String graphTableClassName = parser.get("graphTableClassName");
|
||||
log.info("graphTableClassName: {}", graphTableClassName);
|
||||
|
||||
Class<? extends Oaf> clazz = (Class<? extends Oaf>) Class.forName(graphTableClassName);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
|
||||
conf.registerKryoClasses(ModelSupport.getOafModelClasses());
|
||||
|
||||
runWithSparkSession(conf, isSparkSessionManaged, spark -> {
|
||||
String type = clazz.getSimpleName().toLowerCase();
|
||||
|
||||
String rawPath = rawGraphPath + "/" + type;
|
||||
String claimPath = claimsGraphPath + "/" + type;
|
||||
String outPath = outputRawGaphPath + "/" + type;
|
||||
|
||||
removeOutputDir(spark, outPath);
|
||||
mergeByType(spark, rawPath, claimPath, outPath, clazz);
|
||||
});
|
||||
}
|
||||
|
||||
private static <T extends Oaf> void mergeByType(SparkSession spark, String rawPath, String claimPath, String outPath, Class<T> clazz) {
|
||||
Dataset<Tuple2<String, T>> raw = readFromPath(spark, rawPath, clazz)
|
||||
.map((MapFunction<T, Tuple2<String, T>>) value -> new Tuple2<>(idFn().apply(value), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
||||
|
||||
final JavaSparkContext jsc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
Dataset<Tuple2<String, T>> claim = jsc.broadcast(readFromPath(spark, claimPath, clazz))
|
||||
.getValue()
|
||||
.map((MapFunction<T, Tuple2<String, T>>) value -> new Tuple2<>(idFn().apply(value), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
||||
|
||||
/*
|
||||
Dataset<Tuple2<String, T>> claim = readFromPath(spark, claimPath, clazz)
|
||||
.map((MapFunction<T, Tuple2<String, T>>) value -> new Tuple2<>(idFn().apply(value), value), Encoders.tuple(Encoders.STRING(), Encoders.kryo(clazz)));
|
||||
*/
|
||||
|
||||
raw.joinWith(claim, raw.col("_1").equalTo(claim.col("_1")), "full_outer")
|
||||
.map((MapFunction<Tuple2<Tuple2<String, T>, Tuple2<String, T>>, T>) value -> {
|
||||
|
||||
Optional<Tuple2<String, T>> opRaw = Optional.ofNullable(value._1());
|
||||
Optional<Tuple2<String, T>> opClaim = Optional.ofNullable(value._2());
|
||||
|
||||
return opRaw.isPresent() ? opRaw.get()._2() : opClaim.isPresent() ? opClaim.get()._2() : null;
|
||||
}, Encoders.bean(clazz))
|
||||
.filter(Objects::nonNull)
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.parquet(outPath);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> Dataset<T> readFromPath(SparkSession spark, String path, Class<T> clazz) {
|
||||
return spark.read()
|
||||
.load(path)
|
||||
.as(Encoders.bean(clazz))
|
||||
.filter((FilterFunction<T>) value -> Objects.nonNull(idFn().apply(value)));
|
||||
}
|
||||
|
||||
private static void removeOutputDir(SparkSession spark, String path) {
|
||||
HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration());
|
||||
}
|
||||
|
||||
private static <T extends Oaf> Function<T, String> idFn() {
|
||||
return x -> {
|
||||
if (isSubClass(x, Relation.class)) {
|
||||
return idFnForRelation(x);
|
||||
}
|
||||
return idFnForOafEntity(x);
|
||||
};
|
||||
}
|
||||
|
||||
private static <T extends Oaf> String idFnForRelation(T t) {
|
||||
Relation r = (Relation) t;
|
||||
return Optional.ofNullable(r.getSource())
|
||||
.map(source -> Optional.ofNullable(r.getTarget())
|
||||
.map(target -> Optional.ofNullable(r.getRelType())
|
||||
.map(relType -> Optional.ofNullable(r.getSubRelType())
|
||||
.map(subRelType -> Optional.ofNullable(r.getRelClass())
|
||||
.map(relClass -> String.join(source, target, relType, subRelType, relClass))
|
||||
.orElse(String.join(source, target, relType, subRelType))
|
||||
)
|
||||
.orElse(String.join(source, target, relType))
|
||||
)
|
||||
.orElse(String.join(source, target))
|
||||
)
|
||||
.orElse(source)
|
||||
)
|
||||
.orElse(null);
|
||||
}
|
||||
|
||||
private static <T extends Oaf> String idFnForOafEntity(T t) {
|
||||
return ((OafEntity) t).getId();
|
||||
}
|
||||
|
||||
}
|
|
@ -1,14 +1,13 @@
|
|||
package eu.dnetlib.dhp.migration.step1;
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.asString;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.dataInfo;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.journal;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listFields;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.listKeyValues;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.qualifier;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.DbClient;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
@ -22,31 +21,7 @@ import java.util.List;
|
|||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.DbClient;
|
||||
import eu.dnetlib.dhp.schema.oaf.Context;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Datasource;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Organization;
|
||||
import eu.dnetlib.dhp.schema.oaf.OtherResearchProduct;
|
||||
import eu.dnetlib.dhp.schema.oaf.Project;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Result;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
|
||||
|
||||
public class MigrateDbEntitiesApplication extends AbstractMigrationApplication implements Closeable {
|
||||
|
||||
|
@ -61,7 +36,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_db_entities_parameters.json")));
|
||||
IOUtils.toString(MigrateDbEntitiesApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_db_entities_parameters.json")));
|
||||
|
||||
parser.parseArgument(args);
|
||||
|
||||
|
@ -111,7 +86,7 @@ public class MigrateDbEntitiesApplication extends AbstractMigrationApplication i
|
|||
}
|
||||
|
||||
public void execute(final String sqlFile, final Function<ResultSet, List<Oaf>> producer) throws Exception {
|
||||
final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/migration/sql/" + sqlFile));
|
||||
final String sql = IOUtils.toString(getClass().getResourceAsStream("/eu/dnetlib/dhp/oa/graph/sql/" + sqlFile));
|
||||
|
||||
final Consumer<ResultSet> consumer = rs -> producer.apply(rs).forEach(oaf -> emitOaf(oaf));
|
||||
|
|
@ -1,18 +1,17 @@
|
|||
package eu.dnetlib.dhp.migration.step1;
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.MdstoreClient;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.migration.utils.AbstractMigrationApplication;
|
||||
import eu.dnetlib.dhp.migration.utils.MdstoreClient;
|
||||
|
||||
public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(MigrateMongoMdstoresApplication.class);
|
||||
|
@ -21,7 +20,7 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrationApplicatio
|
|||
|
||||
public static void main(final String[] args) throws Exception {
|
||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/migration/migrate_mongo_mstores_parameters.json")));
|
||||
IOUtils.toString(MigrateMongoMdstoresApplication.class.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json")));
|
||||
parser.parseArgument(args);
|
||||
|
||||
final String mongoBaseUrl = parser.get("mongoBaseUrl");
|
|
@ -1,27 +1,17 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import eu.dnetlib.dhp.oa.graph.raw.common.PacePerson;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.migration.utils.PacePerson;
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.field;
|
||||
|
||||
public class OafToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
|
@ -1,28 +1,16 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.createOpenaireId;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.field;
|
||||
import static eu.dnetlib.dhp.migration.utils.OafMapperUtils.structuredProperty;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.dom4j.Document;
|
||||
import org.dom4j.Node;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Author;
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.GeoLocation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Instance;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import static eu.dnetlib.dhp.oa.graph.raw.common.OafMapperUtils.*;
|
||||
|
||||
public class OdfToOafMapper extends AbstractMdRecordToOafMapper {
|
||||
|
|
@ -1,9 +1,6 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
|
@ -12,7 +9,9 @@ import org.apache.hadoop.io.SequenceFile;
|
|||
import org.apache.hadoop.io.Text;
|
||||
import org.codehaus.jackson.map.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
public class AbstractMigrationApplication implements Closeable {
|
||||
|
|
@ -1,18 +1,14 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.sql.Connection;
|
||||
import java.sql.DriverManager;
|
||||
import java.sql.ResultSet;
|
||||
import java.sql.SQLException;
|
||||
import java.sql.Statement;
|
||||
import java.util.function.Consumer;
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
import java.sql.*;
|
||||
import java.util.function.Consumer;
|
||||
|
||||
public class DbClient implements Closeable {
|
||||
|
||||
private static final Log log = LogFactory.getLog(DbClient.class);
|
|
@ -1,4 +1,14 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.mongodb.MongoClient;
|
||||
import com.mongodb.MongoClientURI;
|
||||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.bson.Document;
|
||||
|
||||
import java.io.Closeable;
|
||||
import java.io.IOException;
|
||||
|
@ -7,17 +17,6 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
import java.util.stream.StreamSupport;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.bson.Document;
|
||||
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.mongodb.MongoClient;
|
||||
import com.mongodb.MongoClientURI;
|
||||
import com.mongodb.client.MongoCollection;
|
||||
import com.mongodb.client.MongoDatabase;
|
||||
|
||||
public class MdstoreClient implements Closeable {
|
||||
|
||||
private final MongoClient client;
|
|
@ -1,4 +1,8 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
|
@ -6,19 +10,6 @@ import java.util.List;
|
|||
import java.util.Objects;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.DataInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.ExtraInfo;
|
||||
import eu.dnetlib.dhp.schema.oaf.Field;
|
||||
import eu.dnetlib.dhp.schema.oaf.Journal;
|
||||
import eu.dnetlib.dhp.schema.oaf.KeyValue;
|
||||
import eu.dnetlib.dhp.schema.oaf.OAIProvenance;
|
||||
import eu.dnetlib.dhp.schema.oaf.OriginDescription;
|
||||
import eu.dnetlib.dhp.schema.oaf.Qualifier;
|
||||
import eu.dnetlib.dhp.schema.oaf.StructuredProperty;
|
||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||
|
||||
public class OafMapperUtils {
|
||||
|
||||
public static KeyValue keyValue(final String k, final String v) {
|
|
@ -1,19 +1,18 @@
|
|||
package eu.dnetlib.dhp.migration.utils;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Splitter;
|
||||
import com.google.common.collect.Iterables;
|
||||
import com.google.common.collect.Lists;
|
||||
import com.google.common.hash.Hashing;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.text.WordUtils;
|
||||
|
||||
import java.nio.charset.Charset;
|
||||
import java.text.Normalizer;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
|
||||
public class PacePerson {
|
||||
|
||||
|
@ -105,7 +104,7 @@ public class PacePerson {
|
|||
|
||||
private List<String> splitTerms(final String s) {
|
||||
if (particles == null) {
|
||||
particles = loadFromClasspath("/eu/dnetlib/dhp/migration/pace/name_particles.txt");
|
||||
particles = loadFromClasspath("/eu/dnetlib/dhp/oa/graph/pace/name_particles.txt");
|
||||
}
|
||||
|
||||
final List<String> list = Lists.newArrayList();
|
|
@ -1,16 +1,16 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePath",
|
||||
"paramDescription": "the source path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "g",
|
||||
"paramLongName": "graphRawPath",
|
|
@ -1,16 +1,16 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "s",
|
||||
"paramLongName": "sourcePaths",
|
||||
"paramDescription": "the HDFS source paths which contains the sequential file (comma separated)",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "t",
|
||||
"paramLongName": "targetPath",
|
|
@ -0,0 +1,32 @@
|
|||
[
|
||||
{
|
||||
"paramName": "issm",
|
||||
"paramLongName": "isSparkSessionManaged",
|
||||
"paramDescription": "when true will stop SparkSession after job execution",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rgp",
|
||||
"paramLongName": "rawGraphPath",
|
||||
"paramDescription": "the raw graph path",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "cgp",
|
||||
"paramLongName": "claimsGraphPath",
|
||||
"paramDescription": "the path of the claims graph",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "ogp",
|
||||
"paramLongName": "outputRawGaphPath",
|
||||
"paramDescription": "the path of output graph, combining raw and claims",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "clazz",
|
||||
"paramLongName": "graphTableClassName",
|
||||
"paramDescription": "class name associated to the input entity path",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -0,0 +1,10 @@
|
|||
[
|
||||
{"paramName":"is", "paramLongName":"isLookupUrl", "paramDescription": "URL of the isLookUp Service", "paramRequired": true},
|
||||
{"paramName":"sn", "paramLongName":"sourceNameNode", "paramDescription": "nameNode of the source cluster", "paramRequired": true},
|
||||
{"paramName":"tn", "paramLongName":"targetNameNode", "paramDescription": "namoNode of the target cluster", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workingDirectory", "paramDescription": "working directory", "paramRequired": true},
|
||||
{"paramName":"nm", "paramLongName":"distcp_num_maps", "paramDescription": "maximum number of map tasks used in the distcp process", "paramRequired": true},
|
||||
{"paramName":"mm", "paramLongName":"distcp_memory_mb", "paramDescription": "memory for distcp action copying actionsets from remote cluster", "paramRequired": true},
|
||||
{"paramName":"tt", "paramLongName":"distcp_task_timeout", "paramDescription": "timeout for distcp copying actions from remote cluster", "paramRequired": true},
|
||||
{"paramName":"tr", "paramLongName":"transform_only", "paramDescription": "activate tranform-only mode. Only apply transformation step", "paramRequired": true}
|
||||
]
|
|
@ -0,0 +1,538 @@
|
|||
<workflow-app name="create RAW Graph (all steps)" xmlns="uri:oozie:workflow:0.5">
|
||||
|
||||
<parameters>
|
||||
<property>
|
||||
<name>graphBasePath</name>
|
||||
<description>the target path to store raw graph</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>reuseContent</name>
|
||||
<value>false</value>
|
||||
<description>should import content from the aggregator or reuse a previous version</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresURL</name>
|
||||
<description>the postgres URL to access to the database</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresUser</name>
|
||||
<description>the user postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>postgresPassword</name>
|
||||
<description>the password postgres</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoURL</name>
|
||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>mongoDb</name>
|
||||
<description>mongo database</description>
|
||||
</property>
|
||||
|
||||
<property>
|
||||
<name>sparkDriverMemory</name>
|
||||
<description>memory for driver process</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorMemory</name>
|
||||
<description>memory for individual executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>sparkExecutorCores</name>
|
||||
<description>number of cores used by single executor</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozieActionShareLibForSpark2</name>
|
||||
<description>oozie action sharelib for spark 2.*</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2ExtraListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorAppListener</value>
|
||||
<description>spark 2.* extra listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2SqlQueryExecutionListeners</name>
|
||||
<value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
|
||||
<description>spark 2.* sql query execution listeners classname</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2YarnHistoryServerAddress</name>
|
||||
<description>spark 2.* yarn history server address</description>
|
||||
</property>
|
||||
<property>
|
||||
<name>spark2EventLogDir</name>
|
||||
<description>spark 2.* event log dir location</description>
|
||||
</property>
|
||||
</parameters>
|
||||
|
||||
<global>
|
||||
<job-tracker>${jobTracker}</job-tracker>
|
||||
<name-node>${nameNode}</name-node>
|
||||
<configuration>
|
||||
<property>
|
||||
<name>mapreduce.job.queuename</name>
|
||||
<value>${queueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.launcher.mapred.job.queue.name</name>
|
||||
<value>${oozieLauncherQueueName}</value>
|
||||
</property>
|
||||
<property>
|
||||
<name>oozie.action.sharelib.for.spark</name>
|
||||
<value>${oozieActionShareLibForSpark2}</value>
|
||||
</property>
|
||||
</configuration>
|
||||
</global>
|
||||
|
||||
<start to="reuse_aggregator_content"/>
|
||||
|
||||
<kill name="Kill">
|
||||
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
|
||||
</kill>
|
||||
|
||||
<decision name="reuse_aggregator_content">
|
||||
<switch>
|
||||
<case to="start_import">${wf:conf('reuseContent') eq false}</case>
|
||||
<case to="fork_generate_entities">${wf:conf('reuseContent') eq true}</case>
|
||||
<default to="start_import"/>
|
||||
</switch>
|
||||
</decision>
|
||||
|
||||
<fork name="start_import">
|
||||
<path start="ImportDB"/>
|
||||
<path start="ImportDB_claims"/>
|
||||
</fork>
|
||||
|
||||
<action name="ImportDB_claims">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${workingDir}/db_claims"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingDir}/db_claims</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
<arg>-a</arg><arg>claims</arg>
|
||||
</java>
|
||||
<ok to="ImportODF_claims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODF_claims">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${workingDir}/odf_claims"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingDir}/odf_claims</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>claim</arg>
|
||||
</java>
|
||||
<ok to="ImportOAF_claims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAF_claims">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${workingDir}/oaf_claims"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingDir}/oaf_claims</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>claim</arg>
|
||||
</java>
|
||||
<ok to="wait_import"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportDB">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${workingDir}/db_records"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateDbEntitiesApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingDir}/db_records</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</java>
|
||||
<ok to="ImportODF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportODF">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${workingDir}/odf_records"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingDir}/odf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>ODF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="ImportOAF"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="ImportOAF">
|
||||
<java>
|
||||
<prepare>
|
||||
<delete path="${workingDir}/oaf_records"/>
|
||||
</prepare>
|
||||
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
|
||||
<arg>-p</arg><arg>${workingDir}/oaf_records</arg>
|
||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||
<arg>-f</arg><arg>OAF</arg>
|
||||
<arg>-l</arg><arg>store</arg>
|
||||
<arg>-i</arg><arg>cleaned</arg>
|
||||
</java>
|
||||
<ok to="wait_import"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_import" to="fork_generate_entities"/>
|
||||
|
||||
<fork name="fork_generate_entities">
|
||||
<path start="GenerateEntities_claim"/>
|
||||
<path start="GenerateEntities"/>
|
||||
</fork>
|
||||
|
||||
<action name="GenerateEntities_claim">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEntities_claim</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-s</arg><arg>${workingDir}/db_claims,${workingDir}/oaf_claims,${workingDir}/odf_claims</arg>
|
||||
<arg>-t</arg><arg>${workingDir}/entities_claim</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="GenerateGraph_claims"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateGraph_claims">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateGraph_claims</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-s</arg><arg>${workingDir}/entities_claim</arg>
|
||||
<arg>-g</arg><arg>${workingDir}/graph_claims</arg>
|
||||
</spark>
|
||||
<ok to="wait_graphs"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateEntities">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateEntities</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.GenerateEntitiesApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
</spark-opts>
|
||||
<arg>-s</arg><arg>${workingDir}/db_records,${workingDir}/oaf_records,${workingDir}/odf_records</arg>
|
||||
<arg>-t</arg><arg>${workingDir}/entities</arg>
|
||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
||||
</spark>
|
||||
<ok to="GenerateGraph"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="GenerateGraph">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>GenerateGraph</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.DispatchEntitiesApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>-s</arg><arg>${workingDir}/entities</arg>
|
||||
<arg>-g</arg><arg>${workingDir}/graph_raw</arg>
|
||||
</spark>
|
||||
<ok to="wait_graphs"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_graphs" to="fork_merge_claims"/>
|
||||
|
||||
<fork name="fork_merge_claims">
|
||||
<path start="merge_claims_publication"/>
|
||||
<path start="merge_claims_dataset"/>
|
||||
<path start="merge_claims_software"/>
|
||||
<path start="merge_claims_otherresearchproduct"/>
|
||||
<path start="merge_claims_datasource"/>
|
||||
<path start="merge_claims_organization"/>
|
||||
<path start="merge_claims_project"/>
|
||||
<path start="merge_claims_relation"/>
|
||||
</fork>
|
||||
|
||||
|
||||
<action name="merge_claims_publication">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MergeClaims_publication</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
|
||||
<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
|
||||
<arg>--outputRawGaphPath</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Publication</arg>
|
||||
</spark>
|
||||
<ok to="wait_merge"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="merge_claims_dataset">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MergeClaims_dataset</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=7680
|
||||
</spark-opts>
|
||||
<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
|
||||
<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
|
||||
<arg>--outputRawGaphPath</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Dataset</arg>
|
||||
</spark>
|
||||
<ok to="wait_merge"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="merge_claims_relation">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MergeClaims_relation</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=3840
|
||||
</spark-opts>
|
||||
<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
|
||||
<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
|
||||
<arg>--outputRawGaphPath</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Relation</arg>
|
||||
</spark>
|
||||
<ok to="wait_merge"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="merge_claims_software">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MergeClaims_software</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=1920
|
||||
</spark-opts>
|
||||
<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
|
||||
<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
|
||||
<arg>--outputRawGaphPath</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Software</arg>
|
||||
</spark>
|
||||
<ok to="wait_merge"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="merge_claims_otherresearchproduct">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MergeClaims_otherresearchproduct</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=1920
|
||||
</spark-opts>
|
||||
<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
|
||||
<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
|
||||
<arg>--outputRawGaphPath</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.OtherResearchProduct</arg>
|
||||
</spark>
|
||||
<ok to="wait_merge"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="merge_claims_datasource">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MergeClaims_datasource</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=200
|
||||
</spark-opts>
|
||||
<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
|
||||
<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
|
||||
<arg>--outputRawGaphPath</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Datasource</arg>
|
||||
</spark>
|
||||
<ok to="wait_merge"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="merge_claims_organization">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MergeClaims_organization</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=200
|
||||
</spark-opts>
|
||||
<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
|
||||
<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
|
||||
<arg>--outputRawGaphPath</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Organization</arg>
|
||||
</spark>
|
||||
<ok to="wait_merge"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<action name="merge_claims_project">
|
||||
<spark xmlns="uri:oozie:spark-action:0.2">
|
||||
<master>yarn</master>
|
||||
<mode>cluster</mode>
|
||||
<name>MergeClaims_project</name>
|
||||
<class>eu.dnetlib.dhp.oa.graph.raw.MergeClaimsApplication</class>
|
||||
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
|
||||
<spark-opts>
|
||||
--executor-memory ${sparkExecutorMemory}
|
||||
--executor-cores ${sparkExecutorCores}
|
||||
--driver-memory=${sparkDriverMemory}
|
||||
--conf spark.extraListeners=${spark2ExtraListeners}
|
||||
--conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
|
||||
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
|
||||
--conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
|
||||
--conf spark.sql.shuffle.partitions=200
|
||||
</spark-opts>
|
||||
<arg>--rawGraphPath</arg><arg>${workingDir}/graph_raw</arg>
|
||||
<arg>--claimsGraphPath</arg><arg>${workingDir}/graph_claims</arg>
|
||||
<arg>--outputRawGaphPath</arg><arg>${graphBasePath}/graph_raw</arg>
|
||||
<arg>--graphTableClassName</arg><arg>eu.dnetlib.dhp.schema.oaf.Project</arg>
|
||||
</spark>
|
||||
<ok to="wait_merge"/>
|
||||
<error to="Kill"/>
|
||||
</action>
|
||||
|
||||
<join name="wait_merge" to="End"/>
|
||||
|
||||
<end name="End"/>
|
||||
</workflow-app>
|
|
@ -0,0 +1,20 @@
|
|||
[
|
||||
{
|
||||
"paramName": "mt",
|
||||
"paramLongName": "master",
|
||||
"paramDescription": "should be local or yarn",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "is",
|
||||
"paramLongName": "isLookupUrl",
|
||||
"paramDescription": "URL of the isLookUp Service",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "i",
|
||||
"paramLongName": "inputPaths",
|
||||
"paramDescription": "URL of the isLookUp Service",
|
||||
"paramRequired": true
|
||||
}
|
||||
]
|
|
@ -1,34 +1,25 @@
|
|||
package eu.dnetlib.dhp.oa.graph;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
import eu.dnetlib.dhp.schema.common.ModelSupport;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.lang3.RandomStringUtils;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.junit.jupiter.api.io.TempDir;
|
||||
import org.junit.jupiter.api.AfterAll;
|
||||
import org.junit.jupiter.api.Assertions;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.Objects;
|
||||
|
||||
public class GraphHiveImporterJobTest {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(GraphHiveImporterJobTest.class);
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static final ClassLoader cl = GraphHiveImporterJobTest.class.getClassLoader();
|
||||
|
||||
public static final String JDBC_DERBY_TEMPLATE = "jdbc:derby:;databaseName=%s/junit_metastore_db;create=true";
|
||||
|
||||
private static SparkSession spark;
|
||||
|
|
|
@ -1,27 +1,24 @@
|
|||
package eu.dnetlib.dhp.migration.step2;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.ArgumentMatchers.anyString;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import com.google.common.collect.Maps;
|
||||
import eu.dnetlib.dhp.schema.oaf.*;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.junit.jupiter.api.BeforeAll;
|
||||
import org.junit.jupiter.api.BeforeEach;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import org.junit.jupiter.api.extension.ExtendWith;
|
||||
import org.mockito.Mock;
|
||||
import org.mockito.junit.jupiter.MockitoExtension;
|
||||
|
||||
import eu.dnetlib.dhp.schema.oaf.Dataset;
|
||||
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||
import eu.dnetlib.dhp.schema.oaf.Publication;
|
||||
import eu.dnetlib.dhp.schema.oaf.Relation;
|
||||
import eu.dnetlib.dhp.schema.oaf.Software;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
import static org.mockito.ArgumentMatchers.anyString;
|
||||
import static org.mockito.Mockito.when;
|
||||
|
||||
@ExtendWith(MockitoExtension.class)
|
||||
public class MappersTest {
|
||||
|
@ -30,7 +27,7 @@ public class MappersTest {
|
|||
private Map<String, String> code2name;
|
||||
|
||||
@BeforeEach
|
||||
void setUp() throws Exception {
|
||||
public void setUp() throws Exception {
|
||||
when(code2name.get(anyString())).thenAnswer(invocation -> invocation.getArgument(0));
|
||||
}
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
package eu.dnetlib.dhp.migration.step1;
|
||||
package eu.dnetlib.dhp.oa.graph.raw;
|
||||
|
||||
import com.fasterxml.jackson.core.type.TypeReference;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
6
pom.xml
6
pom.xml
|
@ -346,6 +346,12 @@
|
|||
<artifactId>mongo-java-driver</artifactId>
|
||||
<version>${mongodb.driver.version}</version>
|
||||
</dependency>
|
||||
<dependency>
|
||||
<groupId>org.postgresql</groupId>
|
||||
<artifactId>postgresql</artifactId>
|
||||
<version>42.2.10</version>
|
||||
</dependency>
|
||||
|
||||
<dependency>
|
||||
<groupId>org.antlr</groupId>
|
||||
<artifactId>stringtemplate</artifactId>
|
||||
|
|
Loading…
Reference in New Issue