forked from D-Net/dnet-hadoop
align parameter names, graph import procedure WIP
This commit is contained in:
parent f39148dab8
commit 1e7a2ac41d
@@ -27,8 +27,8 @@ import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;

public class TransformationJobTest {

    @Mock
    LongAccumulator accumulator;

@@ -42,9 +42,8 @@ public class TransformationJobTest {
        testDir = Files.createTempDirectory("dhp-collection");
    }

    @After
-   public void teadDown() throws IOException {
+   public void tearDown() throws IOException {
        FileUtils.deleteDirectory(testDir.toFile());
    }

@@ -90,11 +89,8 @@ public class TransformationJobTest {
                "-rh", "",
                "-ro", "",
                "-rr", ""});

    }

    @Test
    public void tryLoadFolderOnCP() throws Exception {
        final String path = this.getClass().getResource("/eu/dnetlib/dhp/transform/mdstorenative").getFile();
@@ -102,7 +98,6 @@ public class TransformationJobTest {

        Path tempDirWithPrefix = Files.createTempDirectory("mdsotre_output");

        System.out.println(tempDirWithPrefix.toFile().getAbsolutePath());

        Files.deleteIfExists(tempDirWithPrefix);
@@ -140,10 +135,6 @@ public class TransformationJobTest {
        Node node = document.selectSingleNode("//CODE/*[local-name()='stylesheet']");

        System.out.println(node.asXML());

    }

}

@@ -0,0 +1,23 @@
package eu.dnetlib.dhp.graph;

import com.google.common.collect.Maps;
import eu.dnetlib.dhp.schema.oaf.*;

import java.util.Map;

public class GraphMappingUtils {

    public final static Map<String, Class> types = Maps.newHashMap();

    static {
        types.put("datasource", Datasource.class);
        types.put("organization", Organization.class);
        types.put("project", Project.class);
        types.put("dataset", Dataset.class);
        types.put("otherresearchproduct", OtherResearchProduct.class);
        types.put("software", Software.class);
        types.put("publication", Publication.class);
        types.put("relation", Relation.class);
    }

}

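The new GraphMappingUtils class centralizes the mapping from type names to OAF model classes, so the importer and its tests share one registry. A minimal lookup sketch (the helper class below is hypothetical, not part of this commit):

    // Hypothetical helper, not part of this commit: shows how the shared
    // GraphMappingUtils.types map can be used to resolve the OAF model class for a type name.
    import eu.dnetlib.dhp.graph.GraphMappingUtils;

    public class GraphTypeLookupExample {

        public static Class resolve(final String typeName) {
            final Class clazz = GraphMappingUtils.types.get(typeName.toLowerCase());
            if (clazz == null) {
                throw new IllegalArgumentException("unknown graph type: " + typeName);
            }
            return clazz;
        }

        public static void main(String[] args) {
            // expected to print eu.dnetlib.dhp.schema.oaf.Publication, as registered above
            System.out.println(resolve("publication").getName());
        }
    }
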
@@ -1,9 +1,7 @@
package eu.dnetlib.dhp.graph;

import com.fasterxml.jackson.databind.ObjectMapper;
-import com.google.common.collect.Maps;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
-import eu.dnetlib.dhp.schema.oaf.*;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.io.Text;

@@ -13,8 +11,6 @@ import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

-import java.util.Map;
-
public class SparkGraphImporterJob {

    public static void main(String[] args) throws Exception {

@@ -27,8 +23,8 @@ public class SparkGraphImporterJob {
                .master(parser.get("master"))
                .getOrCreate();
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());
-       final String inputPath = parser.get("input");
-       final String outputPath = parser.get("outputDir");
+       final String inputPath = parser.get("sourcePath");
+       final String outputPath = parser.get("targetPath");

        final String filter = parser.get("filter");

@@ -36,17 +32,7 @@ public class SparkGraphImporterJob {
        final JavaRDD<Tuple2<String, String>> inputRDD = sc.sequenceFile(inputPath, Text.class, Text.class)
                .map(item -> new Tuple2<>(item._1.toString(), item._2.toString()));

-       final Map<String, Class> types = Maps.newHashMap();
-       types.put("datasource", Datasource.class);
-       types.put("organization", Organization.class);
-       types.put("project", Project.class);
-       types.put("dataset", Dataset.class);
-       types.put("otherresearchproduct", OtherResearchProduct.class);
-       types.put("software", Software.class);
-       types.put("publication", Publication.class);
-       types.put("relation", Relation.class);
-
-       types.forEach((name, clazz) -> {
+       GraphMappingUtils.types.forEach((name, clazz) -> {
            if (StringUtils.isNotBlank(filter) || filter.toLowerCase().contains(name)) {
                spark.createDataset(inputRDD
                        .filter(s -> s._1().equals(clazz.getName()))

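The hunk above swaps the inline type map for the shared GraphMappingUtils.types and iterates it to build one typed Dataset per entity, but it cuts off before the Dataset is persisted. A self-contained sketch of the loop under stated assumptions: the payload is JSON parsed with ObjectMapper, and each Dataset is saved to one folder per type under the target path, matching what SparkGraphImportCounterTest later reads back. Neither of those two steps is shown in this diff.

    // Sketch only: restates the visible per-type import loop; the ObjectMapper parsing
    // step and the write/save layout are assumptions, not taken from the hunk above.
    import com.fasterxml.jackson.databind.ObjectMapper;
    import eu.dnetlib.dhp.graph.GraphMappingUtils;
    import org.apache.commons.lang.StringUtils;
    import org.apache.spark.api.java.JavaRDD;
    import org.apache.spark.sql.Encoders;
    import org.apache.spark.sql.SparkSession;
    import scala.Tuple2;

    public class GraphImportLoopSketch {

        public static void importByType(final SparkSession spark,
                                        final JavaRDD<Tuple2<String, String>> inputRDD,
                                        final String filter,
                                        final String outputPath) {
            GraphMappingUtils.types.forEach((name, clazz) -> {
                // same guard as in the diff above
                if (StringUtils.isNotBlank(filter) || filter.toLowerCase().contains(name)) {
                    spark.createDataset(
                            inputRDD
                                    // keep only the records serialized for this model class
                                    .filter(s -> s._1().equals(clazz.getName()))
                                    // assumption: the payload is JSON mapping onto the OAF bean
                                    .map(s -> new ObjectMapper().readValue(s._2(), clazz))
                                    .rdd(),
                            Encoders.bean(clazz))
                            // assumption: one folder per type under the output path
                            .write()
                            .save(outputPath + "/" + name);
                }
            });
        }
    }
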
@@ -1,6 +1,6 @@
[
  {"paramName":"mt", "paramLongName":"master", "paramDescription": "should be local or yarn", "paramRequired": true},
- {"paramName":"i", "paramLongName":"input", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
+ {"paramName":"s", "paramLongName":"sourcePath", "paramDescription": "the path of the sequencial file to read", "paramRequired": true},
  {"paramName":"f", "paramLongName":"filter", "paramDescription": "csv of typology of dataframe to be generated", "paramRequired": false},
- {"paramName":"o", "paramLongName":"outputDir", "paramDescription": "the path where store DataFrames on HDFS", "paramRequired": true}
+ {"paramName":"t", "paramLongName":"targetPath", "paramDescription": "the path where store DataFrames on HDFS", "paramRequired": true}
]

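With the options renamed to -s/--sourcePath and -t/--targetPath, callers have to pass the new names; the Oozie workflow hunk that follows does exactly that. An illustrative local invocation (both paths are placeholders, not taken from the commit):

    // Illustrative only: invoking the importer with the renamed long option names,
    // mirroring the workflow change below. Both paths are hypothetical placeholders.
    import eu.dnetlib.dhp.graph.SparkGraphImporterJob;

    public class RunGraphImporterLocally {

        public static void main(String[] args) throws Exception {
            SparkGraphImporterJob.main(new String[] {
                    "-mt", "local[*]",
                    "--sourcePath", "/tmp/example/mdstore-sequence-file",   // hypothetical input
                    "--targetPath", "/tmp/example/graph-dataframes",        // hypothetical output
                    "--filter", "publication"
            });
        }
    }
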
@@ -43,8 +43,8 @@
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>--executor-memory ${sparkExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory=${sparkDriverMemory} --conf spark.extraListeners="com.cloudera.spark.lineage.NavigatorAppListener" --conf spark.sql.queryExecutionListeners="com.cloudera.spark.lineage.NavigatorQueryListener"</spark-opts>
            <arg>-mt</arg> <arg>yarn-cluster</arg>
-           <arg>--input</arg><arg>${sourcePath}</arg>
-           <arg>--outputDir</arg><arg>${targetPath}</arg>
+           <arg>--sourcePath</arg><arg>${sourcePath}</arg>
+           <arg>--targetPath</arg><arg>${targetPath}</arg>
            <arg>--filter</arg><arg>${filter}</arg>

        </spark>

@@ -0,0 +1,31 @@
package eu.dnetlib.dhp.graph;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.List;
import java.util.stream.Collectors;

public class SparkGraphImportCounterTest {

    public static List<Tuple2<String, Long>> countEntities(final String inputPath) throws Exception {

        final SparkSession spark = SparkSession
                .builder()
                .appName(SparkGraphImportCounterTest.class.getSimpleName())
                .master("local[*]")
                .getOrCreate();
        final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext());

        return GraphMappingUtils.types.entrySet()
                .stream()
                .map(entry -> {
                    final Long count = spark.read().load(inputPath + "/" + entry.getKey()).as(Encoders.bean(entry.getValue())).count();
                    return new Tuple2<String, Long>(entry.getKey(), count);
                })
                .collect(Collectors.toList());
    }

}

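SparkGraphImportCounterTest reads each per-type Dataset back from the import output and returns (type, count) pairs. A hedged sketch of how those counts could be asserted, based on the assertion that is still commented out in the importer test below (the expected value mirrors the MAX constant introduced there and is otherwise an assumption about the fixture size):

    // Sketch only: turning the commented-out assertion in the test below into a reusable check.
    import java.util.List;

    import org.junit.Assert;

    import scala.Tuple2;

    public class CountAssertionSketch {

        // assumption: every type in the fixture is expected to yield MAX mapped records
        private static final long MAX = 1000L;

        public static void assertCounts(final String outputPath) throws Exception {
            final List<Tuple2<String, Long>> counts =
                    eu.dnetlib.dhp.graph.SparkGraphImportCounterTest.countEntities(outputPath);
            for (Tuple2<String, Long> t : counts) {
                Assert.assertEquals(String.format("mapped %s must be %s", t._1(), MAX),
                        MAX, t._2().longValue());
            }
        }
    }
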
@@ -1,15 +1,39 @@
package eu.dnetlib.dhp.graph;

-import org.junit.Ignore;
-import org.junit.Test;
+import org.apache.commons.io.FileUtils;
+import org.junit.*;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;

public class SparkGraphImporterJobTest {

+   private static final long MAX = 1000L;
+   private Path testDir;
+
+   @Before
+   public void setup() throws IOException {
+       testDir = Files.createTempDirectory(getClass().getSimpleName());
+   }
+
+   @After
+   public void tearDown() throws IOException {
+       FileUtils.deleteDirectory(testDir.toFile());
+   }
+
    @Test
    @Ignore
    public void testImport() throws Exception {
-       SparkGraphImporterJob.main(new String[]{"-mt", "local[*]","-i", "/home/sandro/part-m-02236", "-o", "/tmp/dataframes", "-f", "publication"});
+       SparkGraphImporterJob.main(new String[] {
+               "-mt", "local[*]",
+               "-i", getClass().getResource("/eu/dnetlib/dhp/dhp-sample/part-m-00010").getPath(),
+               "-o", testDir.toString()});
+
+       SparkGraphImportCounterTest.countEntities(testDir.toString()).forEach(t -> {
+           System.out.println(t);
+           //Assert.assertEquals(String.format("mapped %s must be %s", t._1(), MAX), MAX, t._2().longValue());
+       });
    }

}