re-packaged graph dump workflow sources

Moves the dump workflow classes from the package eu.dnetlib.dhp.oa.graph.dump.graph to eu.dnetlib.dhp.oa.graph.dump.complete, and relocates the corresponding job resources from /eu/dnetlib/dhp/oa/graph/dump_whole/ to /eu/dnetlib/dhp/oa/graph/dump/complete/.

Claudio Atzori 2020-11-05 17:38:18 +01:00
parent 144216fb88
commit d10447e747
43 changed files with 50 additions and 77 deletions

View File

@@ -17,7 +17,7 @@ import com.google.gson.Gson;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
-import eu.dnetlib.dhp.oa.graph.dump.graph.Constants;
+import eu.dnetlib.dhp.oa.graph.dump.complete.Constants;
import eu.dnetlib.dhp.utils.DHPUtils;
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
import java.util.List;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.BufferedWriter;
import java.io.IOException;
@@ -38,7 +38,7 @@ public class CreateContextEntities implements Serializable {
.toString(
CreateContextEntities.class
.getResourceAsStream(
-"/eu/dnetlib/dhp/oa/graph/dump_whole/input_entity_parameter.json"));
+"/eu/dnetlib/dhp/oa/graph/dump/complete/input_entity_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.BufferedWriter;
import java.io.IOException;
@@ -44,7 +44,7 @@ public class CreateContextRelation implements Serializable {
.toString(
CreateContextRelation.class
.getResourceAsStream(
-"/eu/dnetlib/dhp/oa/graph/dump_whole/input_entity_parameter.json"));
+"/eu/dnetlib/dhp/oa/graph/dump/complete/input_entity_parameter.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -26,7 +26,6 @@ import eu.dnetlib.dhp.schema.dump.oaf.graph.*;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Funder;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Project;
import eu.dnetlib.dhp.schema.oaf.Field;
import eu.dnetlib.dhp.schema.oaf.H2020Programme;
import eu.dnetlib.dhp.schema.oaf.Journal;
import eu.dnetlib.dhp.schema.oaf.OafEntity;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.util.ArrayList;
import java.util.HashMap;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
import java.util.ArrayList;

View File

@@ -1,13 +1,9 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.StringReader;
import java.sql.ResultSet;
import java.sql.SQLException;
import java.sql.Statement;
import java.util.*;
import java.util.function.Consumer;
import java.util.stream.Collectors;
import org.dom4j.Document;
import org.dom4j.DocumentException;
@@ -16,8 +12,6 @@ import org.dom4j.Node;
import org.dom4j.io.SAXReader;
import org.jetbrains.annotations.NotNull;
import eu.dnetlib.dhp.schema.common.ModelSupport;
import eu.dnetlib.dhp.schema.dump.oaf.graph.ResearchInitiative;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpException;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -31,7 +31,7 @@ public class SparkCollectAndSave implements Serializable {
.toString(
SparkCollectAndSave.class
.getResourceAsStream(
-"/eu/dnetlib/dhp/oa/graph/dump_whole/input_collect_and_save.json"));
+"/eu/dnetlib/dhp/oa/graph/dump/complete/input_collect_and_save.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
import java.util.Optional;
@@ -22,7 +22,7 @@ public class SparkDumpEntitiesJob implements Serializable {
.toString(
SparkDumpEntitiesJob.class
.getResourceAsStream(
-"/eu/dnetlib/dhp/oa/graph/dump_whole/input_parameters.json"));
+"/eu/dnetlib/dhp/oa/graph/dump/complete/input_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -7,7 +7,6 @@ import java.io.Serializable;
import java.util.Optional;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
@@ -17,8 +16,6 @@ import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.gson.Gson;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.common.ModelSupport;
@@ -40,7 +37,7 @@ public class SparkDumpRelationJob implements Serializable {
.toString(
SparkDumpRelationJob.class
.getResourceAsStream(
-"/eu/dnetlib/dhp/oa/graph/dump_whole/input_relationdump_parameters.json"));
+"/eu/dnetlib/dhp/oa/graph/dump/complete/input_relationdump_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.Serializable;
import java.util.*;
@@ -9,9 +9,6 @@ import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.oa.graph.dump.QueryInformationSystem;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.oaf.Result;
/**

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
@@ -39,7 +39,7 @@ public class SparkOrganizationRelation implements Serializable {
.toString(
SparkOrganizationRelation.class
.getResourceAsStream(
-"/eu/dnetlib/dhp/oa/graph/dump_whole/input_organization_parameters.json"));
+"/eu/dnetlib/dhp/oa/graph/dump/complete/input_organization_parameters.json"));
final ArgumentApplicationParser parser = new ArgumentApplicationParser(jsonConfiguration);
parser.parseArgument(args);

View File

@@ -156,7 +156,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table publication </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -182,7 +182,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table dataset </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -208,7 +208,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table ORP </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -234,7 +234,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table software </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -260,7 +260,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table organization </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -286,7 +286,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table project </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -312,7 +312,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table datasource </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpEntitiesJob</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpEntitiesJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -338,7 +338,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table relation </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkDumpRelationJob</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkDumpRelationJob</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -367,7 +367,7 @@
<action name="create_entities_fromcontext">
<java>
-<main-class>eu.dnetlib.dhp.oa.graph.dump.graph.CreateContextEntities</main-class>
+<main-class>eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextEntities</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/collect/communities_infrastructures</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
@@ -378,7 +378,7 @@
<action name="create_relation_fromcontext">
<java>
-<main-class>eu.dnetlib.dhp.oa.graph.dump.graph.CreateContextRelation</main-class>
+<main-class>eu.dnetlib.dhp.oa.graph.dump.complete.CreateContextRelation</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/relation/context</arg>
<arg>--nameNode</arg><arg>${nameNode}</arg>
<arg>--isLookUpUrl</arg><arg>${isLookUpUrl}</arg>
@@ -392,7 +392,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table relation </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkOrganizationRelation</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkOrganizationRelation</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -427,7 +427,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Extract Relations from publication </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkExtractRelationFromEntities</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -453,7 +453,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table dataset </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkExtractRelationFromEntities</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -479,7 +479,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table ORP </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkExtractRelationFromEntities</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -505,7 +505,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Dump table software </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkExtractRelationFromEntities</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkExtractRelationFromEntities</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}
@@ -533,7 +533,7 @@
<master>yarn</master>
<mode>cluster</mode>
<name>Collect Results and Relations and put them in the right path </name>
-<class>eu.dnetlib.dhp.oa.graph.dump.graph.SparkCollectAndSave</class>
+<class>eu.dnetlib.dhp.oa.graph.dump.complete.SparkCollectAndSave</class>
<jar>dhp-graph-mapper-${projectVersion}.jar</jar>
<spark-opts>
--executor-memory=${sparkExecutorMemory}

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import static org.mockito.Mockito.lenient;
@@ -7,7 +7,6 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;
import org.junit.jupiter.api.Assertions;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.util.*;
import java.util.function.Consumer;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException;
import java.nio.file.Files;
@@ -72,7 +72,7 @@ public class DumpOrganizationProjectDatasourceTest {
public void dumpOrganizationTest() throws Exception {
final String sourcePath = getClass()
-.getResource("/eu/dnetlib/dhp/oa/graph/dump/graph/organization")
+.getResource("/eu/dnetlib/dhp/oa/graph/dump/complete/organization")
.getPath();
DumpGraphEntities dg = new DumpGraphEntities();
@@ -101,7 +101,7 @@ public class DumpOrganizationProjectDatasourceTest {
public void dumpProjectTest() {
final String sourcePath = getClass()
-.getResource("/eu/dnetlib/dhp/oa/graph/dump/graph/project")
+.getResource("/eu/dnetlib/dhp/oa/graph/dump/complete/project")
.getPath();
DumpGraphEntities dg = new DumpGraphEntities();
@@ -129,7 +129,7 @@ public class DumpOrganizationProjectDatasourceTest {
@Test
public void dumpDatasourceTest() {
final String sourcePath = getClass()
-.getResource("/eu/dnetlib/dhp/oa/graph/dump/graph/datasource")
+.getResource("/eu/dnetlib/dhp/oa/graph/dump/complete/datasource")
.getPath();
DumpGraphEntities dg = new DumpGraphEntities();

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException;
import java.nio.file.Files;

View File

@@ -1,31 +1,23 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import org.apache.commons.io.FileUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.*;
import org.junit.jupiter.api.extension.ExtendWith;
import org.mockito.Mock;
import org.mockito.junit.jupiter.MockitoExtension;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.community.CommunityMap;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
public class ExtractRelationFromEntityTest {

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import org.dom4j.DocumentException;
import org.junit.jupiter.api.Assertions;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import static org.mockito.Mockito.lenient;

View File

@@ -1,5 +1,5 @@
-package eu.dnetlib.dhp.oa.graph.dump.graph;
+package eu.dnetlib.dhp.oa.graph.dump.complete;
import java.io.IOException;
import java.nio.file.Files;
@@ -7,13 +7,10 @@ import java.nio.file.Path;
import java.util.HashMap;
import org.apache.commons.io.FileUtils;
import org.apache.neethi.Assertion;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.Assertions;
@@ -24,9 +21,7 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.dhp.oa.graph.dump.Utils;
import eu.dnetlib.dhp.schema.dump.oaf.graph.Relation;
import eu.dnetlib.dhp.utils.DHPUtils;
public class RelationFromOrganizationTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -77,7 +72,7 @@ public class RelationFromOrganizationTest {
public void test1() throws Exception {
final String sourcePath = getClass()
-.getResource("/eu/dnetlib/dhp/oa/graph/dump/graph/relation")
+.getResource("/eu/dnetlib/dhp/oa/graph/dump/relation")
.getPath();
final String communityMapPath = getClass()