1
0
Fork 0

WIP mdstore

transaction implemented on hadoop side
This commit is contained in:
Sandro La Bruzzo 2021-01-29 16:42:41 +01:00
parent d942d0c77d
commit 0276180039
18 changed files with 495 additions and 66 deletions

View File

@ -157,7 +157,9 @@ public class MDStore implements Serializable {
@Override @Override
public String toString() { public String toString() {
return String return String
.format("MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]", id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate); .format(
"MDStore [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, hdfsPath=%s, creationDate=%s]",
id, format, layout, interpretation, datasourceName, datasourceId, apiId, hdfsPath, creationDate);
} }
@Override @Override
@ -167,8 +169,12 @@ public class MDStore implements Serializable {
@Override @Override
public boolean equals(final Object obj) { public boolean equals(final Object obj) {
if (this == obj) { return true; } if (this == obj) {
if (!(obj instanceof MDStore)) { return false; } return true;
}
if (!(obj instanceof MDStore)) {
return false;
}
final MDStore other = (MDStore) obj; final MDStore other = (MDStore) obj;
return Objects.equals(id, other.id); return Objects.equals(id, other.id);
} }

View File

@ -62,8 +62,12 @@ public class MDStoreCurrentVersion implements Serializable {
@Override @Override
public boolean equals(final Object obj) { public boolean equals(final Object obj) {
if (this == obj) { return true; } if (this == obj) {
if (!(obj instanceof MDStoreCurrentVersion)) { return false; } return true;
}
if (!(obj instanceof MDStoreCurrentVersion)) {
return false;
}
final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj; final MDStoreCurrentVersion other = (MDStoreCurrentVersion) obj;
return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore); return Objects.equals(currentVersion, other.currentVersion) && Objects.equals(mdstore, other.mdstore);
} }

View File

@ -116,7 +116,9 @@ public class MDStoreVersion implements Serializable {
@Override @Override
public String toString() { public String toString() {
return String return String
.format("MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id, mdstore, writing, readCount, lastUpdate, size, hdfsPath); .format(
"MDStoreVersion [id=%s, mdstore=%s, writing=%s, readCount=%s, lastUpdate=%s, size=%s, hdfsPath=%s]", id,
mdstore, writing, readCount, lastUpdate, size, hdfsPath);
} }
@Override @Override
@ -126,8 +128,12 @@ public class MDStoreVersion implements Serializable {
@Override @Override
public boolean equals(final Object obj) { public boolean equals(final Object obj) {
if (this == obj) { return true; } if (this == obj) {
if (!(obj instanceof MDStoreVersion)) { return false; } return true;
}
if (!(obj instanceof MDStoreVersion)) {
return false;
}
final MDStoreVersion other = (MDStoreVersion) obj; final MDStoreVersion other = (MDStoreVersion) obj;
return Objects.equals(id, other.id); return Objects.equals(id, other.id);
} }

View File

@ -168,7 +168,10 @@ public class MDStoreWithInfo implements Serializable {
@Override @Override
public String toString() { public String toString() {
return String return String
.format("MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]", id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate, lastUpdate, size, numberOfVersions, hdfsPath); .format(
"MDStoreWithInfo [id=%s, format=%s, layout=%s, interpretation=%s, datasourceName=%s, datasourceId=%s, apiId=%s, currentVersion=%s, creationDate=%s, lastUpdate=%s, size=%s, numberOfVersions=%s, hdfsPath=%s]",
id, format, layout, interpretation, datasourceName, datasourceId, apiId, currentVersion, creationDate,
lastUpdate, size, numberOfVersions, hdfsPath);
} }
@Override @Override
@ -178,8 +181,12 @@ public class MDStoreWithInfo implements Serializable {
@Override @Override
public boolean equals(final Object obj) { public boolean equals(final Object obj) {
if (this == obj) { return true; } if (this == obj) {
if (!(obj instanceof MDStoreWithInfo)) { return false; } return true;
}
if (!(obj instanceof MDStoreWithInfo)) {
return false;
}
final MDStoreWithInfo other = (MDStoreWithInfo) obj; final MDStoreWithInfo other = (MDStoreWithInfo) obj;
return Objects.equals(id, other.id); return Objects.equals(id, other.id);
} }

View File

@ -1,5 +1,5 @@
package eu.dnetlib.collector.worker.model; package eu.dnetlib.dhp.collector.worker.model;
import java.util.HashMap; import java.util.HashMap;
import java.util.Map; import java.util.Map;

View File

@ -0,0 +1,54 @@
package eu.dnetlib.dhp.common.rest;
import org.apache.commons.io.IOUtils;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import com.fasterxml.jackson.databind.ObjectMapper;
public class DNetRestClient {
private static ObjectMapper mapper = new ObjectMapper();
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet, clazz);
}
public static String doGET(final String url) throws Exception {
final HttpGet httpGet = new HttpGet(url);
return doHTTPRequest(httpGet);
}
public static <V> String doPOST(final String url, V objParam) throws Exception {
final HttpPost httpPost = new HttpPost(url);
if (objParam != null) {
final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam));
httpPost.setEntity(entity);
httpPost.setHeader("Accept", "application/json");
httpPost.setHeader("Content-type", "application/json");
}
return doHTTPRequest(httpPost);
}
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
return mapper.readValue(doPOST(url, objParam), clazz);
}
private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
CloseableHttpClient client = HttpClients.createDefault();
CloseableHttpResponse response = client.execute(r);
return IOUtils.toString(response.getEntity().getContent());
}
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
return mapper.readValue(doHTTPRequest(r), clazz);
}
}

View File

@ -0,0 +1,164 @@
package eu.dnetlib.dhp.aggregation.mdstore;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.net.URI;
import java.util.Properties;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.worker.CollectorWorker;
import eu.dnetlib.dhp.common.rest.DNetRestClient;
public class MDStoreActionNode {
private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);
enum MDAction {
NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
}
private static final ObjectMapper mapper = new ObjectMapper();
public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort";
public static final String READ_LOCK_URL = "%s/mdstores/mdstore/%s/startReading";
public static final String READ_UNLOCK_URL = "%s/mdstores/version/%s/endReading";
private static final String MDSTOREVERSIONPARAM = "mdStoreVersion";
private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion";
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
IOUtils
.toString(
CollectorWorker.class
.getResourceAsStream(
"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
argumentParser.parseArgument(args);
final MDAction action = MDAction.valueOf(argumentParser.get("action"));
log.info("Curren action is {}", action);
final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
switch (action) {
case NEW_VERSION: {
final String mdStoreID = argumentParser.get("mdStoreID");
if (StringUtils.isBlank(mdStoreID)) {
throw new IllegalArgumentException("missing or empty argument mdStoreId");
}
final MDStoreVersion currentVersion = DNetRestClient
.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
populateOOZIEEnv(MDSTOREVERSIONPARAM, mapper.writeValueAsString(currentVersion));
break;
}
case COMMIT: {
final String hdfsuri = argumentParser.get("namenode");
if (StringUtils.isBlank(hdfsuri)) {
throw new IllegalArgumentException("missing or empty argument namenode");
}
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
Configuration conf = new Configuration();
// Set FileSystem URI
conf.set("fs.defaultFS", hdfsuri);
// Because of Maven
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
System.setProperty("hadoop.home.dir", "/");
// Get the filesystem - HDFS
FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf);
String mdStoreSizeParam = argumentParser.get("mdStoreSize");
if (StringUtils.isBlank(mdStoreSizeParam)) {
throw new IllegalArgumentException("missing or empty argument mdStoreSize");
}
Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + "/size");
FSDataInputStream inputStream = fs.open(hdfstoreSizepath);
final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream));
inputStream.close();
fs.create(hdfstoreSizepath);
DNetRestClient
.doGET(String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize));
break;
}
case ROLLBACK: {
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId()));
break;
}
case READ_LOCK: {
final String mdStoreID = argumentParser.get("mdStoreID");
if (StringUtils.isBlank(mdStoreID)) {
throw new IllegalArgumentException("missing or empty argument mdStoreId");
}
final MDStoreVersion currentVersion = DNetRestClient
.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
populateOOZIEEnv(MDSTOREREADLOCKPARAM, mapper.writeValueAsString(currentVersion));
break;
}
case READ_UNLOCK: {
final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
final MDStoreVersion mdStoreVersion = mapper.readValue(mdStoreVersion_params, MDStoreVersion.class);
if (StringUtils.isBlank(mdStoreVersion.getId())) {
throw new IllegalArgumentException(
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
}
DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId()));
break;
}
default:
throw new IllegalArgumentException("invalid action");
}
}
public static void populateOOZIEEnv(final String paramName, String value) throws Exception {
File file = new File(System.getProperty("oozie.action.output.properties"));
Properties props = new Properties();
props.setProperty(paramName, value);
OutputStream os = new FileOutputStream(file);
props.store(os, "");
os.close();
}
}

View File

@ -3,13 +3,17 @@ package eu.dnetlib.dhp.collection;
import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
import java.io.ByteArrayInputStream; import java.io.*;
import java.nio.charset.StandardCharsets; import java.nio.charset.StandardCharsets;
import java.util.Objects; import java.util.Objects;
import java.util.Optional; import java.util.Optional;
import java.util.Properties;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils; import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf; import org.apache.spark.SparkConf;
@ -19,6 +23,7 @@ import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoder; import org.apache.spark.sql.Encoder;
import org.apache.spark.sql.Encoders; import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.util.LongAccumulator; import org.apache.spark.util.LongAccumulator;
import org.dom4j.Document; import org.dom4j.Document;
import org.dom4j.Node; import org.dom4j.Node;
@ -28,7 +33,11 @@ import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication;
import eu.dnetlib.dhp.common.rest.DNetRestClient;
import eu.dnetlib.dhp.model.mdstore.MetadataRecord; import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
import eu.dnetlib.dhp.model.mdstore.Provenance; import eu.dnetlib.dhp.model.mdstore.Provenance;
import eu.dnetlib.message.MessageManager; import eu.dnetlib.message.MessageManager;
@ -36,6 +45,7 @@ import eu.dnetlib.message.MessageManager;
public class GenerateNativeStoreSparkJob { public class GenerateNativeStoreSparkJob {
private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class); private static final Logger log = LoggerFactory.getLogger(GenerateNativeStoreSparkJob.class);
private static final String DATASET_NAME = "/store";
public static void main(String[] args) throws Exception { public static void main(String[] args) throws Exception {
@ -50,11 +60,15 @@ public class GenerateNativeStoreSparkJob {
final String provenanceArgument = parser.get("provenance"); final String provenanceArgument = parser.get("provenance");
log.info("Provenance is {}", provenanceArgument); log.info("Provenance is {}", provenanceArgument);
final Provenance provenance = jsonMapper.readValue(provenanceArgument, Provenance.class); final Provenance provenance = jsonMapper.readValue(provenanceArgument, Provenance.class);
final String dateOfCollectionArgs = parser.get("dateOfCollection"); final String dateOfCollectionArgs = parser.get("dateOfCollection");
log.info("dateOfCollection is {}", dateOfCollectionArgs); log.info("dateOfCollection is {}", dateOfCollectionArgs);
final long dateOfCollection = new Long(dateOfCollectionArgs); final long dateOfCollection = new Long(dateOfCollectionArgs);
final String sequenceFileInputPath = parser.get("input");
log.info("sequenceFileInputPath is {}", dateOfCollectionArgs); String mdStoreVersion = parser.get("mdStoreVersion");
log.info("mdStoreVersion is {}", mdStoreVersion);
final MDStoreVersion currentVersion = jsonMapper.readValue(mdStoreVersion, MDStoreVersion.class);
Boolean isSparkSessionManaged = Optional Boolean isSparkSessionManaged = Optional
.ofNullable(parser.get("isSparkSessionManaged")) .ofNullable(parser.get("isSparkSessionManaged"))
@ -70,7 +84,9 @@ public class GenerateNativeStoreSparkJob {
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
final JavaPairRDD<IntWritable, Text> inputRDD = sc final JavaPairRDD<IntWritable, Text> inputRDD = sc
.sequenceFile(sequenceFileInputPath, IntWritable.class, Text.class); .sequenceFile(
currentVersion.getHdfsPath() + CollectorWorkerApplication.SEQUENTIAL_FILE_NAME,
IntWritable.class, Text.class);
final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems"); final LongAccumulator totalItems = sc.sc().longAccumulator("TotalItems");
final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords"); final LongAccumulator invalidRecords = sc.sc().longAccumulator("InvalidRecords");
@ -89,12 +105,26 @@ public class GenerateNativeStoreSparkJob {
.distinct(); .distinct();
final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class); final Encoder<MetadataRecord> encoder = Encoders.bean(MetadataRecord.class);
final Dataset<MetadataRecord> mdstore = spark.createDataset(nativeStore.rdd(), encoder); Dataset<MetadataRecord> mdstore = spark.createDataset(nativeStore.rdd(), encoder);
final LongAccumulator mdStoreRecords = sc.sc().longAccumulator("MDStoreRecords");
mdStoreRecords.add(mdstore.count());
mdstore.write().format("parquet").save(parser.get("output")); mdstore
.write()
.mode(SaveMode.Overwrite)
.format("parquet")
.save(currentVersion.getHdfsPath() + DATASET_NAME);
mdstore = spark.read().load(currentVersion.getHdfsPath() + DATASET_NAME).as(encoder);
final Long total = mdstore.count();
FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
FSDataOutputStream output = fs.create(new Path(currentVersion.getHdfsPath() + "/size"));
final BufferedOutputStream os = new BufferedOutputStream(output);
os.write(total.toString().getBytes(StandardCharsets.UTF_8));
os.close();
}); });
} }

View File

@ -3,8 +3,8 @@ package eu.dnetlib.dhp.collection.plugin;
import java.util.stream.Stream; import java.util.stream.Stream;
import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.collection.worker.CollectorException; import eu.dnetlib.dhp.collection.worker.CollectorException;
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
public interface CollectorPlugin { public interface CollectorPlugin {

View File

@ -13,9 +13,9 @@ import com.google.common.base.Splitter;
import com.google.common.collect.Iterators; import com.google.common.collect.Iterators;
import com.google.common.collect.Lists; import com.google.common.collect.Lists;
import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.worker.CollectorException; import eu.dnetlib.dhp.collection.worker.CollectorException;
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
public class OaiCollectorPlugin implements CollectorPlugin { public class OaiCollectorPlugin implements CollectorPlugin {

View File

@ -14,12 +14,9 @@ import org.apache.hadoop.io.Text;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.plugin.CollectorPlugin; import eu.dnetlib.dhp.collection.plugin.CollectorPlugin;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
public class CollectorWorker { public class CollectorWorker {

View File

@ -1,15 +1,22 @@
package eu.dnetlib.dhp.collection.worker; package eu.dnetlib.dhp.collection.worker;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.util.Properties;
import org.apache.commons.io.IOUtils; import org.apache.commons.io.IOUtils;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.collector.worker.model.ApiDescriptor; import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.common.rest.DNetRestClient;
/** /**
* DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module * DnetCollectortWorkerApplication is the main class responsible to start the Dnet Collection into HDFS. This module
@ -24,6 +31,8 @@ public class CollectorWorkerApplication {
private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory(); private static final CollectorPluginFactory collectorPluginFactory = new CollectorPluginFactory();
public static String SEQUENTIAL_FILE_NAME = "/sequence_file";
/** /**
* @param args * @param args
*/ */
@ -38,18 +47,23 @@ public class CollectorWorkerApplication {
argumentParser.parseArgument(args); argumentParser.parseArgument(args);
final String hdfsuri = argumentParser.get("namenode"); final String hdfsuri = argumentParser.get("namenode");
log.info("hdfsURI is {}", hdfsuri); log.info("hdfsURI is {}", hdfsuri);
final String hdfsPath = argumentParser.get("hdfsPath");
log.info("hdfsPath is {}" + hdfsPath);
final String apiDescriptor = argumentParser.get("apidescriptor"); final String apiDescriptor = argumentParser.get("apidescriptor");
log.info("apiDescriptor is {}" + apiDescriptor); log.info("apiDescriptor is {}", apiDescriptor);
final String mdStoreVersion = argumentParser.get("mdStoreVersion");
log.info("mdStoreVersion is {}", mdStoreVersion);
final ObjectMapper jsonMapper = new ObjectMapper(); final ObjectMapper jsonMapper = new ObjectMapper();
final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class); final MDStoreVersion currentVersion = jsonMapper.readValue(mdStoreVersion, MDStoreVersion.class);
final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri, hdfsPath); final ApiDescriptor api = jsonMapper.readValue(apiDescriptor, ApiDescriptor.class);
final CollectorWorker worker = new CollectorWorker(collectorPluginFactory, api, hdfsuri,
currentVersion.getHdfsPath() + SEQUENTIAL_FILE_NAME);
worker.collect(); worker.collect();
} }
} }

View File

@ -15,4 +15,9 @@
<name>oozie.action.sharelib.for.spark</name> <name>oozie.action.sharelib.for.spark</name>
<value>spark2</value> <value>spark2</value>
</property> </property>
<property>
<name>oozie.launcher.mapreduce.user.classpath.first</name>
<value>true</value>
</property>
</configuration> </configuration>

View File

@ -30,15 +30,9 @@
"paramRequired": true "paramRequired": true
}, },
{ {
"paramName": "i", "paramName": "mv",
"paramLongName": "input", "paramLongName": "mdStoreVersion",
"paramDescription": "the path of the sequencial file to read", "paramDescription": "the Metadata Store Version Info",
"paramRequired": true
},
{
"paramName": "o",
"paramLongName": "output",
"paramDescription": "the path of the result DataFrame on HDFS",
"paramRequired": true "paramRequired": true
}, },
{ {

View File

@ -1,6 +1,26 @@
[ [
{"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true}, {
{"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true}, "paramName": "a",
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true}, "paramLongName": "apidescriptor",
{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": false} "paramDescription": "the JSON encoding of the API Descriptor",
"paramRequired": true
},
{
"paramName": "n",
"paramLongName": "namenode",
"paramDescription": "the Name Node URI",
"paramRequired": true
},
{
"paramName": "mv",
"paramLongName": "mdStoreVersion",
"paramDescription": "the MDStore Version bean",
"paramRequired": true
},
{
"paramName": "w",
"paramLongName": "workflowId",
"paramDescription": "the identifier of the dnet Workflow",
"paramRequired": false
}
] ]

View File

@ -0,0 +1,45 @@
[
{
"paramName": "a",
"paramLongName": "action",
"paramDescription": "the JSON encoding of the API Descriptor",
"paramRequired": true
},
{
"paramName": "mu",
"paramLongName": "mdStoreManagerURI",
"paramDescription": "the MDStore Manager URI",
"paramRequired": true
},
{
"paramName": "mi",
"paramLongName": "mdStoreID",
"paramDescription": "the Metadata Store ID",
"paramRequired": false
},
{
"paramName": "ms",
"paramLongName": "mdStoreSize",
"paramDescription": "the Metadata Store Size",
"paramRequired": false
},
{
"paramName": "mv",
"paramLongName": "mdStoreVersion",
"paramDescription": "the Metadata Version Bean",
"paramRequired": false
},
{
"paramName": "n",
"paramLongName": "namenode",
"paramDescription": "the Name Node URI",
"paramRequired": false
},
{
"paramName": "rm",
"paramLongName": "readMDStoreId",
"paramDescription": "the ID Locked to Read",
"paramRequired": false
}
]

View File

@ -1,10 +1,5 @@
<workflow-app name="CollectionWorkflow" xmlns="uri:oozie:workflow:0.5"> <workflow-app name="CollectionWorkflow" xmlns="uri:oozie:workflow:0.5">
<parameters> <parameters>
<property>
<name>mdStorePath</name>
<description>the path of the native mdstore</description>
</property>
<property> <property>
<name>apiDescription</name> <name>apiDescription</name>
<description>A json encoding of the API Description class</description> <description>A json encoding of the API Description class</description>
@ -16,7 +11,7 @@
</property> </property>
<property> <property>
<name>identifierPath</name> <name>identifierPath</name>
<description>An xpath to retrieve the metadata idnentifier for the generation of DNet Identifier </description> <description>An xpath to retrieve the metadata identifier for the generation of DNet Identifier </description>
</property> </property>
<property> <property>
@ -33,26 +28,78 @@
<name>workflowId</name> <name>workflowId</name>
<description>The identifier of the workflow</description> <description>The identifier of the workflow</description>
</property> </property>
<property>
<name>mdStoreID</name>
<description>The identifier of the mdStore</description>
</property>
<property>
<name>mdStoreManagerURI</name>
<description>The URI of the MDStore Manager</description>
</property>
<property>
<name>collectionMode</name>
<description>Should be Refresh or Incremental</description>
</property>
</parameters> </parameters>
<global> <global>
<job-tracker>${jobTracker}</job-tracker> <job-tracker>${jobTracker}</job-tracker>
<name-node>${nameNode}</name-node> <name-node>${nameNode}</name-node>
</global> </global>
<start to="CollectionWorker"/>
<start to="StartTransaction"/>
<kill name="Kill"> <kill name="Kill">
<message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message> <message>Action failed, error message[${wf:errorMessage(wf:lastErrorNode())}]</message>
</kill> </kill>
<decision name="collection_mode">
<switch>
<case to="StartTransaction">${wf:conf('collectionMode') eq 'REFRESH'}</case>
<case to="BeginRead">${wf:conf('collectionMode') eq 'INCREMENTAL'}</case>
<default to="ImportDatacite"/>
</switch>
</decision>
<action name="BeginRead">
<java>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>READ_LOCK</arg>
<arg>--mdStoreID</arg><arg>${mdStoreID}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<capture-output/>
</java>
<ok to="StartTransaction"/>
<error to="Kill"/>
</action>
<action name="StartTransaction">
<java>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>NEW_VERSION</arg>
<arg>--mdStoreID</arg><arg>${mdStoreID}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<capture-output/>
</java>
<ok to="CollectionWorker"/>
<error to="Kill"/>
</action>
<action name="CollectionWorker"> <action name="CollectionWorker">
<java> <java>
<main-class>eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication</main-class> <main-class>eu.dnetlib.dhp.collection.worker.CollectorWorkerApplication</main-class>
<arg>--hdfsPath</arg><arg>${workingDir}/sequenceFile_${mdstoreVersion}</arg>
<arg>--apidescriptor</arg><arg>${apiDescription}</arg> <arg>--apidescriptor</arg><arg>${apiDescription}</arg>
<arg>--namenode</arg><arg>${nameNode}</arg> <arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
</java> </java>
<ok to="GenerateNativeStoreSparkJob"/> <ok to="GenerateNativeStoreSparkJob"/>
<error to="Kill"/> <error to="RollBack"/>
</action> </action>
<action name="GenerateNativeStoreSparkJob"> <action name="GenerateNativeStoreSparkJob">
@ -75,13 +122,56 @@
<arg>--dateOfCollection</arg><arg>${timestamp}</arg> <arg>--dateOfCollection</arg><arg>${timestamp}</arg>
<arg>--provenance</arg><arg>${dataSourceInfo}</arg> <arg>--provenance</arg><arg>${dataSourceInfo}</arg>
<arg>--xpath</arg><arg>${identifierPath}</arg> <arg>--xpath</arg><arg>${identifierPath}</arg>
<arg>--input</arg><arg>${workingDir}/sequenceFile</arg> <arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--output</arg><arg>${mdStorePath}</arg>
<arg>-w</arg><arg>${workflowId}</arg>
</spark> </spark>
<ok to="collection_mode_end"/>
<error to="RollBack"/>
</action>
<decision name="collection_mode_end">
<switch>
<case to="CommitVersion">${wf:conf('collectionMode') eq 'REFRESH'}</case>
<case to="EndRead">${wf:conf('collectionMode') eq 'INCREMENTAL'}</case>
<default to="ImportDatacite"/>
</switch>
</decision>
<action name="EndRead">
<java>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>READ_UNLOCK</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
<arg>--readMDStoreId</arg><arg>${wf:actionData('BeginRead')['mdStoreReadLockVersion']}</arg>
<capture-output/>
</java>
<ok to="CommitVersion"/>
<error to="Kill"/>
</action>
<action name="CommitVersion">
<java>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>COMMIT</arg>
<arg>--namenode</arg><arg>${nameNode}</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('StartTransaction')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="End"/> <ok to="End"/>
<error to="Kill"/> <error to="Kill"/>
</action> </action>
<action name="RollBack">
<java>
<main-class>eu.dnetlib.dhp.aggregation.mdstore.MDStoreActionNode</main-class>
<arg>--action</arg><arg>COMMIT</arg>
<arg>--mdStoreVersion</arg><arg>${wf:actionData('CollectionWorker')['mdStoreVersion']}</arg>
<arg>--mdStoreManagerURI</arg><arg>${mdStoreManagerURI}</arg>
</java>
<ok to="Kill"/>
<error to="Kill"/>
</action>
<end name="End"/> <end name="End"/>
</workflow-app> </workflow-app>

View File

@ -2,25 +2,18 @@
package eu.dnetlib.dhp.collector.worker; package eu.dnetlib.dhp.collector.worker;
import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.mockito.Mockito.*;
import java.io.File;
import java.nio.file.Path; import java.nio.file.Path;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Disabled;
import org.junit.jupiter.api.Test; import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir; import org.junit.jupiter.api.io.TempDir;
import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectMapper;
import eu.dnetlib.collector.worker.model.ApiDescriptor;
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
import eu.dnetlib.dhp.collection.worker.CollectorWorker; import eu.dnetlib.dhp.collection.worker.CollectorWorker;
import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory; import eu.dnetlib.dhp.collection.worker.utils.CollectorPluginFactory;
import eu.dnetlib.message.Message; import eu.dnetlib.dhp.collector.worker.model.ApiDescriptor;
import eu.dnetlib.message.MessageManager;
@Disabled @Disabled
public class DnetCollectorWorkerApplicationTests { public class DnetCollectorWorkerApplicationTests {