save as gzipped sequence file

3 years ago · ad56a44fda
parent 4fa5671d16
commit ad56a44fda
4 changed files with 169 additions and 36 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateHdfsMdstoresApplication.java
@@ -6,10 +6,14 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
 import java.util.Arrays;
 import java.util.Optional;
 import java.util.Set;
+import java.util.UUID;
 import java.util.stream.Collectors;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang3.StringUtils;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.compress.GzipCodec;
+import org.apache.hadoop.mapred.SequenceFileOutputFormat;
 import org.apache.http.client.methods.CloseableHttpResponse;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.impl.client.CloseableHttpClient;
@@ -29,24 +33,18 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.HdfsSupport;
 import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
 import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
+import scala.Tuple2;

 public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication {

 	private static final Logger log = LoggerFactory.getLogger(MigrateHdfsMdstoresApplication.class);

-	private final String mdstoreManagerUrl;
-
-	private final String format;
-
-	private final String layout;
-
-	private final String interpretation;
-
 	public static void main(final String[] args) throws Exception {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
-				.toString(MigrateHdfsMdstoresApplication.class
-					.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_hdfs_mstores_parameters.json")));
+				.toString(
+					MigrateHdfsMdstoresApplication.class
+						.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_hdfs_mstores_parameters.json")));
 		parser.parseArgument(args);

 		final Boolean isSparkSessionManaged = Optional
@@ -62,39 +60,51 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication

 		final String hdfsPath = parser.get("hdfsPath");

+		final Set<String> paths = mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation);
+
 		final SparkConf conf = new SparkConf();
 		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
-			try (final MigrateHdfsMdstoresApplication app =
-				new MigrateHdfsMdstoresApplication(hdfsPath, mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation)) {
-				app.execute(spark);
-			}
+			HdfsSupport.remove(hdfsPath, spark.sparkContext().hadoopConfiguration());
+			processPaths(spark, hdfsPath, paths, String.format("%s-%s-%s", mdFormat, mdLayout, mdInterpretation));
 		});
 	}

-	public MigrateHdfsMdstoresApplication(final String hdfsPath, final String mdstoreManagerUrl, final String format, final String layout,
-		final String interpretation) throws Exception {
-		super(hdfsPath);
-		this.mdstoreManagerUrl = mdstoreManagerUrl;
-		this.format = format;
-		this.layout = layout;
-		this.interpretation = interpretation;
-	}
-
-	public void execute(final SparkSession spark) throws Exception {
+	public static void processPaths(final SparkSession spark, // session used for the parquet read below
+		final String outputPath, // handed to saveAsHadoopFile as the output location
+		final Set<String> paths, // candidate parquet locations; filtered with HdfsSupport.exists before reading
+		final String type) throws Exception { // appended after "<uuid>:" in every record key

 		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

-		final Set<String> paths = mdstorePaths(sc);
 		log.info("Found " + paths.size() + " not empty mdstores");
+		paths.forEach(log::info); // logs each candidate path; the HdfsSupport.exists filter only runs below, after the count above was logged

-		spark.read()
-			.parquet(paths.toArray(new String[paths.size()]))
+		final String[] validPaths = paths
+			.stream()
+			.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration())) // keep only paths for which HdfsSupport.exists returns true
+			.toArray(size -> new String[size]);
+
+		spark
+			.read()
+			.parquet(validPaths) // NOTE(review): validPaths can be empty if no path passes the filter - confirm how the parquet reader behaves with zero paths
 			.map((MapFunction<Row, String>) r -> r.getAs("body"), Encoders.STRING())
-			.foreach(xml -> emit(xml, String.format("%s-%s-%s", format, layout, interpretation)));
+			.toJavaRDD()
+			.mapToPair(xml -> new Tuple2<>(new Text(UUID.randomUUID() + ":" + type), new Text(xml))) // key = random UUID + ":" + type, value = the row's "body" column
+			.coalesce(1) // single partition - presumably so that one output file is written; TODO confirm
+			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class); // Text/Text sequence file written with GzipCodec
+
+		/* NOTE(review): commented-out writer-based variant of the pipeline above; consider deleting it.
+		 * .foreach(xml -> { try { writer.append(new Text(UUID.randomUUID() + ":" + type), new Text(xml)); } catch
+		 * (final Exception e) { throw new RuntimeException(e); } });
+		 */
 	}

-	private Set<String> mdstorePaths(final JavaSparkContext sc) throws Exception {
-		final String url = mdstoreManagerUrl + "/mdstores";
+	private static Set<String> mdstorePaths(final String mdstoreManagerUrl,
+		final String format,
+		final String layout,
+		final String interpretation)
+		throws Exception {
+		final String url = mdstoreManagerUrl + "/mdstores/";
 		final ObjectMapper objectMapper = new ObjectMapper();

 		final HttpGet req = new HttpGet(url);
@@ -103,7 +113,8 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
 			try (final CloseableHttpResponse response = client.execute(req)) {
 				final String json = IOUtils.toString(response.getEntity().getContent());
 				final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
-				return Arrays.stream(mdstores)
+				return Arrays
+					.stream(mdstores)
 					.filter(md -> md.getFormat().equalsIgnoreCase(format))
 					.filter(md -> md.getLayout().equalsIgnoreCase(layout))
 					.filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
@@ -111,10 +122,8 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
 					.filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
 					.filter(md -> md.getSize() > 0)
 					.map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
-					.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
 					.collect(Collectors.toSet());
 			}
 		}
 	}
-
 }
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/raw/MigrateMongoMdstoresApplication.java
@@ -10,9 +10,6 @@ import org.apache.commons.io.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

-import com.mongodb.MongoClient;
-import com.mongodb.MongoClientURI;
-
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 import eu.dnetlib.dhp.common.MdstoreClient;
 import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/config-default.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/config-default.xml
@@ -0,0 +1,18 @@
+<configuration> <!-- property values shipped with the raw_hdfs_stores oozie_app as config-default.xml -->
+    <property>
+        <name>jobTracker</name>
+        <value>yarnRM</value>
+    </property>
+    <property>
+        <name>nameNode</name>
+        <value>hdfs://nameservice1</value>
+    </property>
+    <property>
+        <name>oozie.use.system.libpath</name>
+        <value>true</value>
+    </property>
+    <property>
+        <name>oozie.action.sharelib.for.spark</name> <!-- NOTE(review): workflow.xml in the same oozie_app also sets this property, from ${oozieActionShareLibForSpark2}; confirm which value takes effect -->
+        <value>spark2</value>
+    </property>
+</configuration>
--- a/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-graph-mapper/src/main/resources/eu/dnetlib/dhp/oa/graph/raw_hdfs_stores/oozie_app/workflow.xml
@@ -0,0 +1,109 @@
+<workflow-app name="Test Import of Hdfs Stores" xmlns="uri:oozie:workflow:0.5"> <!-- one Spark action (ImportODF_hdfs) running MigrateHdfsMdstoresApplication; ok goes to End, error goes to Kill -->
+
+    <parameters>
+        <property>
+            <name>graphOutputPath</name> <!-- NOTE(review): declared here but not referenced anywhere else in this workflow -->
+            <description>the target path to store raw graph</description>
+        </property>
+        <property>
+            <name>contentPath</name>
+            <description>path location to store (or reuse) content from the aggregator</description>
+        </property>
+        <property>
+            <name>mdstoreManagerUrl</name>
+            <description>the address of the Mdstore Manager</description>
+        </property>
+		<property>
+            <name>isLookupUrl</name> <!-- NOTE(review): declared here but not referenced anywhere else in this workflow -->
+            <description>the address of the lookUp service</description>
+        </property>
+        <property>
+            <name>sparkDriverMemory</name>
+            <description>memory for driver process</description>
+        </property>
+        <property>
+            <name>sparkExecutorMemory</name>
+            <description>memory for individual executor</description>
+        </property>
+        <property>
+            <name>sparkExecutorCores</name>
+            <description>number of cores used by single executor</description>
+        </property>
+        <property>
+            <name>oozieActionShareLibForSpark2</name>
+            <description>oozie action sharelib for spark 2.*</description>
+        </property>
+        <property>
+            <name>spark2ExtraListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
+            <description>spark 2.* extra listeners classname</description>
+        </property>
+        <property>
+            <name>spark2SqlQueryExecutionListeners</name>
+            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
+            <description>spark 2.* sql query execution listeners classname</description>
+        </property>
+        <property>
+            <name>spark2YarnHistoryServerAddress</name>
+            <description>spark 2.* yarn history server address</description>
+        </property>
+        <property>
+            <name>spark2EventLogDir</name>
+            <description>spark 2.* event log dir location</description>
+        </property>
+    </parameters>
+
+    <global>
+        <job-tracker>${jobTracker}</job-tracker>
+        <name-node>${nameNode}</name-node>
+        <configuration>
+            <property>
+                <name>mapreduce.job.queuename</name>
+                <value>${queueName}</value> <!-- NOTE(review): queueName is not declared in <parameters> above; presumably supplied by the job properties - confirm -->
+            </property>
+            <property>
+                <name>oozie.launcher.mapred.job.queue.name</name>
+                <value>${oozieLauncherQueueName}</value> <!-- NOTE(review): oozieLauncherQueueName is not declared in <parameters> above; presumably supplied by the job properties - confirm -->
+            </property>
+            <property>
+                <name>oozie.action.sharelib.for.spark</name>
+                <value>${oozieActionShareLibForSpark2}</value>
+            </property>
+        </configuration>
+    </global>
+
+    <start to="ImportODF_hdfs"/>
+
+    <kill name="Kill">
+        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
+    </kill>
+    
+    <action name="ImportODF_hdfs">
+        <spark xmlns="uri:oozie:spark-action:0.2">
+            <master>yarn</master>
+            <mode>cluster</mode>
+            <name>ImportODF_hdfs</name>
+            <class>eu.dnetlib.dhp.oa.graph.raw.MigrateHdfsMdstoresApplication</class>
+            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
+            <spark-opts>
+                --executor-memory ${sparkExecutorMemory}
+                --executor-cores ${sparkExecutorCores}
+                --driver-memory=${sparkDriverMemory}
+                --conf spark.extraListeners=${spark2ExtraListeners}
+                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
+                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
+                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
+            </spark-opts>
+            <arg>--hdfsPath</arg><arg>${contentPath}/odf_records_hdfs</arg> <!-- output location; the application calls HdfsSupport.remove on this path before writing -->
+            <arg>--mdstoreManagerUrl</arg><arg>${mdstoreManagerUrl}</arg>
+            <arg>--mdFormat</arg><arg>ODF</arg> <!-- format, layout and interpretation are matched with equalsIgnoreCase when the application selects mdstores -->
+            <arg>--mdLayout</arg><arg>store</arg>
+            <arg>--mdInterpretation</arg><arg>cleaned</arg>
+        </spark>
+        <ok to="End"/>
+        <error to="Kill"/>
+    </action>
+
+    
+    <end name="End"/>
+</workflow-app>