forked from D-Net/dnet-hadoop
save as gzipped sequence file
parent 4fa5671d16
commit ad56a44fda
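
Note (illustrative, not part of this commit): the core of the change is writing the harvested mdstore records as a gzip-compressed Hadoop SequenceFile instead of emitting them record by record. A minimal, self-contained sketch of the same Text/Text key scheme and saveAsHadoopFile call used below; the local master, sample records, and output path are assumptions for the example only:

import java.util.Arrays;
import java.util.UUID;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class GzipSequenceFileWriteSketch {

	public static void main(final String[] args) {
		// assumption: a local Spark context and a writable output path passed as the first argument
		try (final JavaSparkContext sc = new JavaSparkContext("local[*]", "gzip-seqfile-sketch")) {
			final String outputPath = args.length > 0 ? args[0] : "/tmp/records_seqfile";

			sc
				.parallelize(Arrays.asList("<record>a</record>", "<record>b</record>"))
				// same key scheme as processPaths: a random UUID plus a "format-layout-interpretation" suffix
				.mapToPair(xml -> new Tuple2<>(new Text(UUID.randomUUID() + ":ODF-store-cleaned"), new Text(xml)))
				.coalesce(1)
				// SequenceFileOutputFormat + GzipCodec yield the gzipped sequence file named in the commit title
				.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);
		}
	}
}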
@@ -6,10 +6,14 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.util.Arrays;
import java.util.Optional;
import java.util.Set;
import java.util.UUID;
import java.util.stream.Collectors;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
@@ -29,24 +33,18 @@ import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.HdfsSupport;
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
import scala.Tuple2;

public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication {

	private static final Logger log = LoggerFactory.getLogger(MigrateHdfsMdstoresApplication.class);

	private final String mdstoreManagerUrl;

	private final String format;

	private final String layout;

	private final String interpretation;

	public static void main(final String[] args) throws Exception {
		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
			IOUtils
				.toString(MigrateHdfsMdstoresApplication.class
					.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_hdfs_mstores_parameters.json")));
				.toString(
					MigrateHdfsMdstoresApplication.class
						.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_hdfs_mstores_parameters.json")));
		parser.parseArgument(args);

		final Boolean isSparkSessionManaged = Optional
@@ -62,39 +60,51 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication

		final String hdfsPath = parser.get("hdfsPath");

		final Set<String> paths = mdstorePaths(mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation);

		final SparkConf conf = new SparkConf();
		runWithSparkSession(conf, isSparkSessionManaged, spark -> {
			try (final MigrateHdfsMdstoresApplication app =
				new MigrateHdfsMdstoresApplication(hdfsPath, mdstoreManagerUrl, mdFormat, mdLayout, mdInterpretation)) {
				app.execute(spark);
			}
			HdfsSupport.remove(hdfsPath, spark.sparkContext().hadoopConfiguration());
			processPaths(spark, hdfsPath, paths, String.format("%s-%s-%s", mdFormat, mdLayout, mdInterpretation));
		});
	}

	public MigrateHdfsMdstoresApplication(final String hdfsPath, final String mdstoreManagerUrl, final String format, final String layout,
		final String interpretation) throws Exception {
		super(hdfsPath);
		this.mdstoreManagerUrl = mdstoreManagerUrl;
		this.format = format;
		this.layout = layout;
		this.interpretation = interpretation;
	}

	public void execute(final SparkSession spark) throws Exception {
	public static void processPaths(final SparkSession spark,
		final String outputPath,
		final Set<String> paths,
		final String type) throws Exception {

		final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());

		final Set<String> paths = mdstorePaths(sc);
		log.info("Found " + paths.size() + " not empty mdstores");
		paths.forEach(log::info);

		spark.read()
			.parquet(paths.toArray(new String[paths.size()]))
		final String[] validPaths = paths
			.stream()
			.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
			.toArray(size -> new String[size]);

		spark
			.read()
			.parquet(validPaths)
			.map((MapFunction<Row, String>) r -> r.getAs("body"), Encoders.STRING())
			.foreach(xml -> emit(xml, String.format("%s-%s-%s", format, layout, interpretation)));
			.toJavaRDD()
			.mapToPair(xml -> new Tuple2<>(new Text(UUID.randomUUID() + ":" + type), new Text(xml)))
			.coalesce(1)
			.saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class, GzipCodec.class);

		/*
		 * .foreach(xml -> { try { writer.append(new Text(UUID.randomUUID() + ":" + type), new Text(xml)); } catch
		 * (final Exception e) { throw new RuntimeException(e); } });
		 */
	}

	private Set<String> mdstorePaths(final JavaSparkContext sc) throws Exception {
		final String url = mdstoreManagerUrl + "/mdstores";
	private static Set<String> mdstorePaths(final String mdstoreManagerUrl,
		final String format,
		final String layout,
		final String interpretation)
		throws Exception {
		final String url = mdstoreManagerUrl + "/mdstores/";
		final ObjectMapper objectMapper = new ObjectMapper();

		final HttpGet req = new HttpGet(url);
@@ -103,7 +113,8 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
		try (final CloseableHttpResponse response = client.execute(req)) {
			final String json = IOUtils.toString(response.getEntity().getContent());
			final MDStoreWithInfo[] mdstores = objectMapper.readValue(json, MDStoreWithInfo[].class);
			return Arrays.stream(mdstores)
			return Arrays
				.stream(mdstores)
				.filter(md -> md.getFormat().equalsIgnoreCase(format))
				.filter(md -> md.getLayout().equalsIgnoreCase(layout))
				.filter(md -> md.getInterpretation().equalsIgnoreCase(interpretation))
@@ -111,10 +122,8 @@ public class MigrateHdfsMdstoresApplication extends AbstractMigrationApplication
				.filter(md -> StringUtils.isNotBlank(md.getCurrentVersion()))
				.filter(md -> md.getSize() > 0)
				.map(md -> md.getHdfsPath() + "/" + md.getCurrentVersion() + "/store")
				.filter(p -> HdfsSupport.exists(p, sc.hadoopConfiguration()))
				.collect(Collectors.toSet());
		}
	}
}

}
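
For orientation (illustrative, not part of this commit): the output written by processPaths above is a SequenceFile of Text keys and Text record bodies, so a downstream Spark job could read it back along these lines; the local master and the path argument are assumptions for the example only. The gzip codec needs no extra configuration on read, since SequenceFiles record their codec in the file header.

import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class ReadMdstoreSequenceFileSketch {

	public static void main(final String[] args) {
		// assumption: args[0] is the directory written by processPaths, e.g. <contentPath>/odf_records_hdfs
		try (final JavaSparkContext sc = new JavaSparkContext("local[*]", "read-seqfile-sketch")) {
			final JavaRDD<String> bodies = sc
				.sequenceFile(args[0], Text.class, Text.class)
				// Hadoop reuses the same Text instances while reading, so copy each value to a String immediately
				.map(t -> t._2().toString());
			System.out.println("record count: " + bodies.count());
			bodies.take(1).forEach(System.out::println);
		}
	}
}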
|
||||
|
|
|
@@ -10,9 +10,6 @@ import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.mongodb.MongoClient;
import com.mongodb.MongoClientURI;

import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.common.MdstoreClient;
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
@@ -0,0 +1,18 @@
<configuration>
    <property>
        <name>jobTracker</name>
        <value>yarnRM</value>
    </property>
    <property>
        <name>nameNode</name>
        <value>hdfs://nameservice1</value>
    </property>
    <property>
        <name>oozie.use.system.libpath</name>
        <value>true</value>
    </property>
    <property>
        <name>oozie.action.sharelib.for.spark</name>
        <value>spark2</value>
    </property>
</configuration>
@@ -0,0 +1,109 @@
<workflow-app name="Test Import of Hdfs Stores" xmlns="uri:oozie:workflow:0.5">

    <parameters>
        <property>
            <name>graphOutputPath</name>
            <description>the target path to store raw graph</description>
        </property>
        <property>
            <name>contentPath</name>
            <description>path location to store (or reuse) content from the aggregator</description>
        </property>
        <property>
            <name>mdstoreManagerUrl</name>
            <description>the address of the Mdstore Manager</description>
        </property>
        <property>
            <name>isLookupUrl</name>
            <description>the address of the lookUp service</description>
        </property>
        <property>
            <name>sparkDriverMemory</name>
            <description>memory for driver process</description>
        </property>
        <property>
            <name>sparkExecutorMemory</name>
            <description>memory for individual executor</description>
        </property>
        <property>
            <name>sparkExecutorCores</name>
            <description>number of cores used by single executor</description>
        </property>
        <property>
            <name>oozieActionShareLibForSpark2</name>
            <description>oozie action sharelib for spark 2.*</description>
        </property>
        <property>
            <name>spark2ExtraListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorAppListener</value>
            <description>spark 2.* extra listeners classname</description>
        </property>
        <property>
            <name>spark2SqlQueryExecutionListeners</name>
            <value>com.cloudera.spark.lineage.NavigatorQueryListener</value>
            <description>spark 2.* sql query execution listeners classname</description>
        </property>
        <property>
            <name>spark2YarnHistoryServerAddress</name>
            <description>spark 2.* yarn history server address</description>
        </property>
        <property>
            <name>spark2EventLogDir</name>
            <description>spark 2.* event log dir location</description>
        </property>
    </parameters>

    <global>
        <job-tracker>${jobTracker}</job-tracker>
        <name-node>${nameNode}</name-node>
        <configuration>
            <property>
                <name>mapreduce.job.queuename</name>
                <value>${queueName}</value>
            </property>
            <property>
                <name>oozie.launcher.mapred.job.queue.name</name>
                <value>${oozieLauncherQueueName}</value>
            </property>
            <property>
                <name>oozie.action.sharelib.for.spark</name>
                <value>${oozieActionShareLibForSpark2}</value>
            </property>
        </configuration>
    </global>

    <start to="ImportODF_hdfs"/>

    <kill name="Kill">
        <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
    </kill>

    <action name="ImportODF_hdfs">
        <spark xmlns="uri:oozie:spark-action:0.2">
            <master>yarn</master>
            <mode>cluster</mode>
            <name>ImportODF_hdfs</name>
            <class>eu.dnetlib.dhp.oa.graph.raw.MigrateHdfsMdstoresApplication</class>
            <jar>dhp-graph-mapper-${projectVersion}.jar</jar>
            <spark-opts>
                --executor-memory ${sparkExecutorMemory}
                --executor-cores ${sparkExecutorCores}
                --driver-memory=${sparkDriverMemory}
                --conf spark.extraListeners=${spark2ExtraListeners}
                --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners}
                --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress}
                --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir}
            </spark-opts>
            <arg>--hdfsPath</arg><arg>${contentPath}/odf_records_hdfs</arg>
            <arg>--mdstoreManagerUrl</arg><arg>${mdstoreManagerUrl}</arg>
            <arg>--mdFormat</arg><arg>ODF</arg>
            <arg>--mdLayout</arg><arg>store</arg>
            <arg>--mdInterpretation</arg><arg>cleaned</arg>
        </spark>
        <ok to="End"/>
        <error to="Kill"/>
    </action>

    <end name="End"/>
</workflow-app>
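
As a quick sanity check after the ImportODF_hdfs action has run (illustrative, not part of this commit): the part files under the --hdfsPath directory are ordinary Hadoop sequence files, and since the compression codec is stored in each file's header they can be inspected without any gzip-specific configuration. The path argument below is an assumption for the example only.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class InspectSequenceFileSketch {

	public static void main(final String[] args) throws Exception {
		// assumption: args[0] points at one part file, e.g. <contentPath>/odf_records_hdfs/part-00000
		final Configuration conf = new Configuration();
		try (final SequenceFile.Reader reader = new SequenceFile.Reader(conf,
			SequenceFile.Reader.file(new Path(args[0])))) {
			final Text key = new Text();
			final Text value = new Text();
			int shown = 0;
			// print the key and the first characters of the record body for a handful of entries
			while (reader.next(key, value) && shown < 5) {
				final String body = value.toString();
				System.out.println(key + " -> " + body.substring(0, Math.min(80, body.length())));
				shown++;
			}
		}
	}
}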