Merge branch 'hadoop_aggregator' of code-repo.d4science.org:D-Net/dnet-hadoop into hadoop_aggregator
commit
33f4696d6e
@ -0,0 +1,21 @@
|
||||
|
||||
package eu.dnetlib.dhp.application;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.util.Properties;
|
||||
|
||||
public class ApplicationUtils {
|
||||
|
||||
public static void populateOOZIEEnv(final String paramName, String value) throws Exception {
|
||||
File file = new File(System.getProperty("oozie.action.output.properties"));
|
||||
Properties props = new Properties();
|
||||
|
||||
props.setProperty(paramName, value);
|
||||
OutputStream os = new FileOutputStream(file);
|
||||
props.store(os, "");
|
||||
os.close();
|
||||
}
|
||||
|
||||
}
|
@ -1,5 +1,5 @@
|
||||
|
||||
package eu.dnetlib.collector.worker.model;
|
||||
package eu.dnetlib.dhp.collector.worker.model;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
@ -0,0 +1,54 @@
|
||||
|
||||
package eu.dnetlib.dhp.common.rest;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
||||
import org.apache.http.client.methods.HttpGet;
|
||||
import org.apache.http.client.methods.HttpPost;
|
||||
import org.apache.http.client.methods.HttpUriRequest;
|
||||
import org.apache.http.entity.StringEntity;
|
||||
import org.apache.http.impl.client.CloseableHttpClient;
|
||||
import org.apache.http.impl.client.HttpClients;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class DNetRestClient {
|
||||
|
||||
private static ObjectMapper mapper = new ObjectMapper();
|
||||
|
||||
public static <T> T doGET(final String url, Class<T> clazz) throws Exception {
|
||||
final HttpGet httpGet = new HttpGet(url);
|
||||
return doHTTPRequest(httpGet, clazz);
|
||||
}
|
||||
|
||||
public static String doGET(final String url) throws Exception {
|
||||
final HttpGet httpGet = new HttpGet(url);
|
||||
return doHTTPRequest(httpGet);
|
||||
}
|
||||
|
||||
public static <V> String doPOST(final String url, V objParam) throws Exception {
|
||||
final HttpPost httpPost = new HttpPost(url);
|
||||
|
||||
if (objParam != null) {
|
||||
final StringEntity entity = new StringEntity(mapper.writeValueAsString(objParam));
|
||||
httpPost.setEntity(entity);
|
||||
httpPost.setHeader("Accept", "application/json");
|
||||
httpPost.setHeader("Content-type", "application/json");
|
||||
}
|
||||
return doHTTPRequest(httpPost);
|
||||
}
|
||||
|
||||
public static <T, V> T doPOST(final String url, V objParam, Class<T> clazz) throws Exception {
|
||||
return mapper.readValue(doPOST(url, objParam), clazz);
|
||||
}
|
||||
|
||||
private static String doHTTPRequest(final HttpUriRequest r) throws Exception {
|
||||
CloseableHttpClient client = HttpClients.createDefault();
|
||||
CloseableHttpResponse response = client.execute(r);
|
||||
return IOUtils.toString(response.getEntity().getContent());
|
||||
}
|
||||
|
||||
private static <T> T doHTTPRequest(final HttpUriRequest r, Class<T> clazz) throws Exception {
|
||||
return mapper.readValue(doHTTPRequest(r), clazz);
|
||||
}
|
||||
}
|
@ -1,76 +0,0 @@
|
||||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Map;
|
||||
|
||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
public class Message {
|
||||
|
||||
private String workflowId;
|
||||
|
||||
private String jobName;
|
||||
|
||||
private MessageType type;
|
||||
|
||||
private Map<String, String> body;
|
||||
|
||||
public static Message fromJson(final String json) throws IOException {
|
||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
||||
return jsonMapper.readValue(json, Message.class);
|
||||
}
|
||||
|
||||
public Message() {
|
||||
}
|
||||
|
||||
public Message(String workflowId, String jobName, MessageType type, Map<String, String> body) {
|
||||
this.workflowId = workflowId;
|
||||
this.jobName = jobName;
|
||||
this.type = type;
|
||||
this.body = body;
|
||||
}
|
||||
|
||||
public String getWorkflowId() {
|
||||
return workflowId;
|
||||
}
|
||||
|
||||
public void setWorkflowId(String workflowId) {
|
||||
this.workflowId = workflowId;
|
||||
}
|
||||
|
||||
public String getJobName() {
|
||||
return jobName;
|
||||
}
|
||||
|
||||
public void setJobName(String jobName) {
|
||||
this.jobName = jobName;
|
||||
}
|
||||
|
||||
public MessageType getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public void setType(MessageType type) {
|
||||
this.type = type;
|
||||
}
|
||||
|
||||
public Map<String, String> getBody() {
|
||||
return body;
|
||||
}
|
||||
|
||||
public void setBody(Map<String, String> body) {
|
||||
this.body = body;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
final ObjectMapper jsonMapper = new ObjectMapper();
|
||||
try {
|
||||
return jsonMapper.writeValueAsString(this);
|
||||
} catch (JsonProcessingException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
}
|
@ -1,47 +0,0 @@
|
||||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
|
||||
import com.rabbitmq.client.AMQP;
|
||||
import com.rabbitmq.client.Channel;
|
||||
import com.rabbitmq.client.DefaultConsumer;
|
||||
import com.rabbitmq.client.Envelope;
|
||||
|
||||
public class MessageConsumer extends DefaultConsumer {
|
||||
|
||||
final LinkedBlockingQueue<Message> queueMessages;
|
||||
|
||||
/**
|
||||
* Constructs a new instance and records its association to the passed-in channel.
|
||||
*
|
||||
* @param channel the channel to which this consumer is attached
|
||||
* @param queueMessages
|
||||
*/
|
||||
public MessageConsumer(Channel channel, LinkedBlockingQueue<Message> queueMessages) {
|
||||
super(channel);
|
||||
this.queueMessages = queueMessages;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void handleDelivery(
|
||||
String consumerTag, Envelope envelope, AMQP.BasicProperties properties, byte[] body)
|
||||
throws IOException {
|
||||
final String json = new String(body, StandardCharsets.UTF_8);
|
||||
Message message = Message.fromJson(json);
|
||||
try {
|
||||
this.queueMessages.put(message);
|
||||
System.out.println("Receiving Message " + message);
|
||||
} catch (InterruptedException e) {
|
||||
if (message.getType() == MessageType.REPORT)
|
||||
throw new RuntimeException("Error on sending message");
|
||||
else {
|
||||
// TODO LOGGING EXCEPTION
|
||||
}
|
||||
} finally {
|
||||
getChannel().basicAck(envelope.getDeliveryTag(), false);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,136 +0,0 @@
|
||||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeoutException;
|
||||
|
||||
import com.rabbitmq.client.Channel;
|
||||
import com.rabbitmq.client.Connection;
|
||||
import com.rabbitmq.client.ConnectionFactory;
|
||||
|
||||
public class MessageManager {
|
||||
|
||||
private final String messageHost;
|
||||
|
||||
private final String username;
|
||||
|
||||
private final String password;
|
||||
|
||||
private Connection connection;
|
||||
|
||||
private final Map<String, Channel> channels = new HashMap<>();
|
||||
|
||||
private boolean durable;
|
||||
|
||||
private boolean autodelete;
|
||||
|
||||
private final LinkedBlockingQueue<Message> queueMessages;
|
||||
|
||||
public MessageManager(
|
||||
String messageHost,
|
||||
String username,
|
||||
String password,
|
||||
final LinkedBlockingQueue<Message> queueMessages) {
|
||||
this.queueMessages = queueMessages;
|
||||
this.messageHost = messageHost;
|
||||
this.username = username;
|
||||
this.password = password;
|
||||
}
|
||||
|
||||
public MessageManager(
|
||||
String messageHost,
|
||||
String username,
|
||||
String password,
|
||||
boolean durable,
|
||||
boolean autodelete,
|
||||
final LinkedBlockingQueue<Message> queueMessages) {
|
||||
this.queueMessages = queueMessages;
|
||||
this.messageHost = messageHost;
|
||||
this.username = username;
|
||||
this.password = password;
|
||||
|
||||
this.durable = durable;
|
||||
this.autodelete = autodelete;
|
||||
}
|
||||
|
||||
private Connection createConnection() throws IOException, TimeoutException {
|
||||
ConnectionFactory factory = new ConnectionFactory();
|
||||
factory.setHost(this.messageHost);
|
||||
factory.setUsername(this.username);
|
||||
factory.setPassword(this.password);
|
||||
return factory.newConnection();
|
||||
}
|
||||
|
||||
private Channel createChannel(
|
||||
final Connection connection,
|
||||
final String queueName,
|
||||
final boolean durable,
|
||||
final boolean autodelete)
|
||||
throws Exception {
|
||||
Map<String, Object> args = new HashMap<>();
|
||||
args.put("x-message-ttl", 10000);
|
||||
Channel channel = connection.createChannel();
|
||||
channel.queueDeclare(queueName, durable, false, this.autodelete, args);
|
||||
return channel;
|
||||
}
|
||||
|
||||
private Channel getOrCreateChannel(final String queueName, boolean durable, boolean autodelete)
|
||||
throws Exception {
|
||||
if (channels.containsKey(queueName)) {
|
||||
return channels.get(queueName);
|
||||
}
|
||||
|
||||
if (this.connection == null) {
|
||||
this.connection = createConnection();
|
||||
}
|
||||
channels.put(queueName, createChannel(this.connection, queueName, durable, autodelete));
|
||||
return channels.get(queueName);
|
||||
}
|
||||
|
||||
public void close() throws IOException {
|
||||
channels
|
||||
.values()
|
||||
.forEach(
|
||||
ch -> {
|
||||
try {
|
||||
ch.close();
|
||||
} catch (Exception e) {
|
||||
// TODO LOG
|
||||
}
|
||||
});
|
||||
|
||||
this.connection.close();
|
||||
}
|
||||
|
||||
public boolean sendMessage(final Message message, String queueName) throws Exception {
|
||||
try {
|
||||
Channel channel = getOrCreateChannel(queueName, this.durable, this.autodelete);
|
||||
channel.basicPublish("", queueName, null, message.toString().getBytes());
|
||||
return true;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public boolean sendMessage(
|
||||
final Message message, String queueName, boolean durable_var, boolean autodelete_var)
|
||||
throws Exception {
|
||||
try {
|
||||
Channel channel = getOrCreateChannel(queueName, durable_var, autodelete_var);
|
||||
channel.basicPublish("", queueName, null, message.toString().getBytes());
|
||||
return true;
|
||||
} catch (Throwable e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
public void startConsumingMessage(
|
||||
final String queueName, final boolean durable, final boolean autodelete) throws Exception {
|
||||
|
||||
Channel channel = createChannel(createConnection(), queueName, durable, autodelete);
|
||||
channel.basicConsume(queueName, false, new MessageConsumer(channel, queueMessages));
|
||||
}
|
||||
}
|
@ -1,6 +0,0 @@
|
||||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
public enum MessageType {
|
||||
ONGOING, REPORT
|
||||
}
|
@ -1,51 +0,0 @@
|
||||
|
||||
package eu.dnetlib.message;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class MessageTest {
|
||||
|
||||
@Test
|
||||
public void fromJsonTest() throws IOException {
|
||||
Message m = new Message();
|
||||
m.setWorkflowId("wId");
|
||||
m.setType(MessageType.ONGOING);
|
||||
m.setJobName("Collection");
|
||||
Map<String, String> body = new HashMap<>();
|
||||
body.put("parsedItem", "300");
|
||||
body.put("ExecutionTime", "30s");
|
||||
|
||||
m.setBody(body);
|
||||
System.out.println("m = " + m);
|
||||
Message m1 = Message.fromJson(m.toString());
|
||||
assertEquals(m1.getWorkflowId(), m.getWorkflowId());
|
||||
assertEquals(m1.getType(), m.getType());
|
||||
assertEquals(m1.getJobName(), m.getJobName());
|
||||
|
||||
assertNotNull(m1.getBody());
|
||||
m1.getBody().keySet().forEach(it -> assertEquals(m1.getBody().get(it), m.getBody().get(it)));
|
||||
assertEquals(m1.getJobName(), m.getJobName());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void toStringTest() {
|
||||
final String expectedJson = "{\"workflowId\":\"wId\",\"jobName\":\"Collection\",\"type\":\"ONGOING\",\"body\":{\"ExecutionTime\":\"30s\",\"parsedItem\":\"300\"}}";
|
||||
Message m = new Message();
|
||||
m.setWorkflowId("wId");
|
||||
m.setType(MessageType.ONGOING);
|
||||
m.setJobName("Collection");
|
||||
Map<String, String> body = new HashMap<>();
|
||||
body.put("parsedItem", "300");
|
||||
body.put("ExecutionTime", "30s");
|
||||
|
||||
m.setBody(body);
|
||||
|
||||
assertEquals(expectedJson, m.toString());
|
||||
}
|
||||
}
|
@ -0,0 +1,14 @@
|
||||
|
||||
package eu.dnetlib.dhp.aggregation.common;
|
||||
|
||||
public class AggregationConstants {
|
||||
|
||||
public static final String SEQUENCE_FILE_NAME = "/sequence_file";
|
||||
public static final String MDSTORE_DATA_PATH = "/store";
|
||||
public static final String MDSTORE_SIZE_PATH = "/size";
|
||||
|
||||
public static final String CONTENT_TOTALITEMS = "TotalItems";
|
||||
public static final String CONTENT_INVALIDRECORDS = "InvalidRecords";
|
||||
public static final String CONTENT_TRANSFORMEDRECORDS = "transformedItems";
|
||||
|
||||
}
|
@ -0,0 +1,49 @@
|
||||
|
||||
package eu.dnetlib.dhp.aggregation.common;
|
||||
|
||||
import java.io.BufferedOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
|
||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.SaveMode;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob;
|
||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||
|
||||
public class AggregationUtility {
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(AggregationUtility.class);
|
||||
|
||||
public static final ObjectMapper MAPPER = new ObjectMapper();
|
||||
|
||||
public static void writeTotalSizeOnHDFS(final SparkSession spark, final Long total, final String path)
|
||||
throws IOException {
|
||||
|
||||
log.info("writing size ({}) info file {}", total, path);
|
||||
try (FileSystem fs = FileSystem.get(spark.sparkContext().hadoopConfiguration());
|
||||
BufferedOutputStream os = new BufferedOutputStream(fs.create(new Path(path)))) {
|
||||
os.write(total.toString().getBytes(StandardCharsets.UTF_8));
|
||||
os.flush();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public static <T> void saveDataset(final Dataset<T> mdstore, final String targetPath) {
|
||||
log.info("saving dataset in: {}", targetPath);
|
||||
mdstore
|
||||
.write()
|
||||
.mode(SaveMode.Overwrite)
|
||||
.format("parquet")
|
||||
.save(targetPath);
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,148 @@
|
||||
|
||||
package eu.dnetlib.dhp.aggregation.mdstore;
|
||||
|
||||
import static eu.dnetlib.dhp.aggregation.common.AggregationUtility.*;
|
||||
import static eu.dnetlib.dhp.application.ApplicationUtils.*;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.OutputStream;
|
||||
import java.net.URI;
|
||||
import java.util.Properties;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FSDataInputStream;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||
import eu.dnetlib.dhp.common.rest.DNetRestClient;
|
||||
|
||||
public class MDStoreActionNode {
|
||||
private static final Logger log = LoggerFactory.getLogger(MDStoreActionNode.class);
|
||||
|
||||
enum MDAction {
|
||||
NEW_VERSION, ROLLBACK, COMMIT, READ_LOCK, READ_UNLOCK
|
||||
}
|
||||
|
||||
public static String NEW_VERSION_URI = "%s/mdstore/%s/newVersion";
|
||||
|
||||
public static final String COMMIT_VERSION_URL = "%s/version/%s/commit/%s";
|
||||
public static final String ROLLBACK_VERSION_URL = "%s/version/%s/abort";
|
||||
|
||||
public static final String READ_LOCK_URL = "%s/mdstore/%s/startReading";
|
||||
public static final String READ_UNLOCK_URL = "%s/version/%s/endReading";
|
||||
|
||||
private static final String MDSTOREVERSIONPARAM = "mdStoreVersion";
|
||||
private static final String MDSTOREREADLOCKPARAM = "mdStoreReadLockVersion";
|
||||
|
||||
public static void main(String[] args) throws Exception {
|
||||
final ArgumentApplicationParser argumentParser = new ArgumentApplicationParser(
|
||||
IOUtils
|
||||
.toString(
|
||||
MDStoreActionNode.class
|
||||
.getResourceAsStream(
|
||||
"/eu/dnetlib/dhp/collection/mdstore_action_parameters.json")));
|
||||
argumentParser.parseArgument(args);
|
||||
|
||||
final MDAction action = MDAction.valueOf(argumentParser.get("action"));
|
||||
log.info("Current action is {}", action);
|
||||
|
||||
final String mdStoreManagerURI = argumentParser.get("mdStoreManagerURI");
|
||||
log.info("mdStoreManagerURI is {}", mdStoreManagerURI);
|
||||
|
||||
switch (action) {
|
||||
case NEW_VERSION: {
|
||||
final String mdStoreID = argumentParser.get("mdStoreID");
|
||||
if (StringUtils.isBlank(mdStoreID)) {
|
||||
throw new IllegalArgumentException("missing or empty argument mdStoreId");
|
||||
}
|
||||
final MDStoreVersion currentVersion = DNetRestClient
|
||||
.doGET(String.format(NEW_VERSION_URI, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
||||
populateOOZIEEnv(MDSTOREVERSIONPARAM, MAPPER.writeValueAsString(currentVersion));
|
||||
break;
|
||||
}
|
||||
case COMMIT: {
|
||||
|
||||
final String hdfsuri = argumentParser.get("namenode");
|
||||
if (StringUtils.isBlank(hdfsuri)) {
|
||||
throw new IllegalArgumentException("missing or empty argument namenode");
|
||||
}
|
||||
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
||||
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||
|
||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||
throw new IllegalArgumentException(
|
||||
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||
}
|
||||
|
||||
Configuration conf = new Configuration();
|
||||
// Set FileSystem URI
|
||||
conf.set("fs.defaultFS", hdfsuri);
|
||||
// Because of Maven
|
||||
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
||||
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
||||
|
||||
System.setProperty("hadoop.home.dir", "/");
|
||||
// Get the filesystem - HDFS
|
||||
FileSystem fs = FileSystem.get(URI.create(hdfsuri), conf);
|
||||
|
||||
Path hdfstoreSizepath = new Path(mdStoreVersion.getHdfsPath() + "/size");
|
||||
|
||||
FSDataInputStream inputStream = fs.open(hdfstoreSizepath);
|
||||
|
||||
final Long mdStoreSize = Long.parseLong(IOUtils.toString(inputStream));
|
||||
|
||||
inputStream.close();
|
||||
fs.create(hdfstoreSizepath);
|
||||
|
||||
DNetRestClient
|
||||
.doGET(String.format(COMMIT_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId(), mdStoreSize));
|
||||
break;
|
||||
}
|
||||
case ROLLBACK: {
|
||||
final String mdStoreVersion_params = argumentParser.get("mdStoreVersion");
|
||||
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||
|
||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||
throw new IllegalArgumentException(
|
||||
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||
}
|
||||
DNetRestClient.doGET(String.format(ROLLBACK_VERSION_URL, mdStoreManagerURI, mdStoreVersion.getId()));
|
||||
break;
|
||||
}
|
||||
|
||||
case READ_LOCK: {
|
||||
final String mdStoreID = argumentParser.get("mdStoreID");
|
||||
if (StringUtils.isBlank(mdStoreID)) {
|
||||
throw new IllegalArgumentException("missing or empty argument mdStoreId");
|
||||
}
|
||||
final MDStoreVersion currentVersion = DNetRestClient
|
||||
.doGET(String.format(READ_LOCK_URL, mdStoreManagerURI, mdStoreID), MDStoreVersion.class);
|
||||
populateOOZIEEnv(MDSTOREREADLOCKPARAM, MAPPER.writeValueAsString(currentVersion));
|
||||
break;
|
||||
}
|
||||
case READ_UNLOCK: {
|
||||
final String mdStoreVersion_params = argumentParser.get("readMDStoreId");
|
||||
final MDStoreVersion mdStoreVersion = MAPPER.readValue(mdStoreVersion_params, MDStoreVersion.class);
|
||||
|
||||
if (StringUtils.isBlank(mdStoreVersion.getId())) {
|
||||
throw new IllegalArgumentException(
|
||||
"invalid MDStoreVersion value current is " + mdStoreVersion_params);
|
||||
}
|
||||
DNetRestClient.doGET(String.format(READ_UNLOCK_URL, mdStoreManagerURI, mdStoreVersion.getId()));
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
throw new IllegalArgumentException("invalid action");
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -1,6 +1,26 @@
|
||||
[
|
||||
{"paramName":"p", "paramLongName":"hdfsPath", "paramDescription": "the path where storing the sequential file", "paramRequired": true},
|
||||
{"paramName":"a", "paramLongName":"apidescriptor", "paramDescription": "the JSON encoding of the API Descriptor", "paramRequired": true},
|
||||
{"paramName":"n", "paramLongName":"namenode", "paramDescription": "the Name Node URI", "paramRequired": true},
|
||||
{"paramName":"w", "paramLongName":"workflowId", "paramDescription": "the identifier of the dnet Workflow", "paramRequired": false}
|
||||
{
|
||||
"paramName": "a",
|
||||
"paramLongName": "apidescriptor",
|
||||
"paramDescription": "the JSON encoding of the API Descriptor",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
"paramDescription": "the Name Node URI",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mv",
|
||||
"paramLongName": "mdStoreVersion",
|
||||
"paramDescription": "the MDStore Version bean",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "w",
|
||||
"paramLongName": "workflowId",
|
||||
"paramDescription": "the identifier of the dnet Workflow",
|
||||
"paramRequired": false
|
||||
}
|
||||
]
|
@ -0,0 +1,45 @@
|
||||
[
|
||||
{
|
||||
"paramName": "a",
|
||||
"paramLongName": "action",
|
||||
"paramDescription": "the JSON encoding of the API Descriptor",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mu",
|
||||
"paramLongName": "mdStoreManagerURI",
|
||||
"paramDescription": "the MDStore Manager URI",
|
||||
"paramRequired": true
|
||||
},
|
||||
{
|
||||
"paramName": "mi",
|
||||
"paramLongName": "mdStoreID",
|
||||
"paramDescription": "the Metadata Store ID",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "ms",
|
||||
"paramLongName": "mdStoreSize",
|
||||
"paramDescription": "the Metadata Store Size",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "mv",
|
||||
"paramLongName": "mdStoreVersion",
|
||||
"paramDescription": "the Metadata Version Bean",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "n",
|
||||
"paramLongName": "namenode",
|
||||
"paramDescription": "the Name Node URI",
|
||||
"paramRequired": false
|
||||
},
|
||||
{
|
||||
"paramName": "rm",
|
||||
"paramLongName": "readMDStoreId",
|
||||
"paramDescription": "the ID Locked to Read",
|
||||
"paramRequired": false
|
||||
}
|
||||
|
||||
]
|
@ -0,0 +1,199 @@
|
||||
|
||||
package eu.dnetlib.dhp.aggregation;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.Stream;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.io.IntWritable;
|
||||
import org.apache.hadoop.io.Text;
|
||||
import org.apache.spark.SparkConf;
|
||||
import org.apache.spark.api.java.JavaSparkContext;
|
||||
import org.apache.spark.api.java.function.MapFunction;
|
||||
import org.apache.spark.sql.Dataset;
|
||||
import org.apache.spark.sql.Encoder;
|
||||
import org.apache.spark.sql.Encoders;
|
||||
import org.apache.spark.sql.SparkSession;
|
||||
import org.junit.jupiter.api.*;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||
|
||||
import eu.dnetlib.data.mdstore.manager.common.model.MDStoreVersion;
|
||||
import eu.dnetlib.dhp.collection.GenerateNativeStoreSparkJob;
|
||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
||||
import eu.dnetlib.dhp.model.mdstore.MetadataRecord;
|
||||
import eu.dnetlib.dhp.transformation.TransformSparkJobNode;
|
||||
import eu.dnetlib.dhp.utils.ISLookupClientFactory;
|
||||
import eu.dnetlib.enabling.is.lookup.rmi.ISLookUpService;
|
||||
|
||||
@TestMethodOrder(MethodOrderer.OrderAnnotation.class)
|
||||
public class AggregationJobTest {
|
||||
|
||||
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
|
||||
|
||||
private static SparkSession spark;
|
||||
|
||||
private static Path workingDir;
|
||||
|
||||
private static Encoder<MetadataRecord> encoder;
|
||||
|
||||
private static final String encoding = "XML";
|
||||
private static final String dateOfCollection = System.currentTimeMillis() + "";
|
||||
private static final String xpath = "//*[local-name()='header']/*[local-name()='identifier']";
|
||||
private static String provenance;
|
||||
|
||||
private static final Logger log = LoggerFactory.getLogger(AggregationJobTest.class);
|
||||
|
||||
@BeforeAll
|
||||
public static void beforeAll() throws IOException {
|
||||
provenance = IOUtils
|
||||
.toString(AggregationJobTest.class.getResourceAsStream("/eu/dnetlib/dhp/collection/provenance.json"));
|
||||
workingDir = Files.createTempDirectory(AggregationJobTest.class.getSimpleName());
|
||||
log.info("using work dir {}", workingDir);
|
||||
|
||||
SparkConf conf = new SparkConf();
|
||||
|
||||
conf.setAppName(AggregationJobTest.class.getSimpleName());
|
||||
|
||||
conf.setMaster("local[*]");
|
||||
conf.set("spark.driver.host", "localhost");
|
||||
conf.set("hive.metastore.local", "true");
|
||||
conf.set("spark.ui.enabled", "false");
|
||||
conf.set("spark.sql.warehouse.dir", workingDir.toString());
|
||||
conf.set("hive.metastore.warehouse.dir", workingDir.resolve("warehouse").toString());
|
||||
|
||||
encoder = Encoders.bean(MetadataRecord.class);
|
||||
spark = SparkSession
|
||||
.builder()
|
||||
.appName(AggregationJobTest.class.getSimpleName())
|
||||
.config(conf)
|
||||
.getOrCreate();
|
||||
}
|
||||
|
||||
@AfterAll
|
||||
public static void afterAll() throws IOException {
|
||||
FileUtils.deleteDirectory(workingDir.toFile());
|
||||
spark.stop();
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(1)
|
||||
public void testGenerateNativeStoreSparkJobRefresh() throws Exception {
|
||||
|
||||
MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json");
|
||||
FileUtils.forceMkdir(new File(mdStoreV1.getHdfsPath()));
|
||||
|
||||
IOUtils
|
||||
.copy(
|
||||
getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"),
|
||||
new FileOutputStream(mdStoreV1.getHdfsPath() + "/sequence_file"));
|
||||
|
||||
GenerateNativeStoreSparkJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-encoding", encoding,
|
||||
"-dateOfCollection", dateOfCollection,
|
||||
"-provenance", provenance,
|
||||
"-xpath", xpath,
|
||||
"-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1),
|
||||
"-readMdStoreVersion", "",
|
||||
"-workflowId", "abc"
|
||||
});
|
||||
|
||||
verify(mdStoreV1);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(2)
|
||||
public void testGenerateNativeStoreSparkJobIncremental() throws Exception {
|
||||
|
||||
MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json");
|
||||
FileUtils.forceMkdir(new File(mdStoreV2.getHdfsPath()));
|
||||
|
||||
IOUtils
|
||||
.copy(
|
||||
getClass().getResourceAsStream("/eu/dnetlib/dhp/collection/sequence_file"),
|
||||
new FileOutputStream(mdStoreV2.getHdfsPath() + "/sequence_file"));
|
||||
|
||||
MDStoreVersion mdStoreV1 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_1.json");
|
||||
|
||||
GenerateNativeStoreSparkJob
|
||||
.main(
|
||||
new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-encoding", encoding,
|
||||
"-dateOfCollection", dateOfCollection,
|
||||
"-provenance", provenance,
|
||||
"-xpath", xpath,
|
||||
"-mdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2),
|
||||
"-readMdStoreVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV1),
|
||||
"-workflowId", "abc"
|
||||
});
|
||||
|
||||
verify(mdStoreV2);
|
||||
}
|
||||
|
||||
@Test
|
||||
@Order(3)
|
||||
public void testTransformSparkJob() throws Exception {
|
||||
|
||||
MDStoreVersion mdStoreV2 = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreVersion_2.json");
|
||||
MDStoreVersion mdStoreCleanedVersion = prepareVersion("/eu/dnetlib/dhp/collection/mdStoreCleanedVersion.json");
|
||||
|
||||
TransformSparkJobNode.main(new String[] {
|
||||
"-isSparkSessionManaged", Boolean.FALSE.toString(),
|
||||
"-dateOfTransformation", dateOfCollection,
|
||||
"-mdstoreInputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreV2),
|
||||
"-mdstoreOutputVersion", OBJECT_MAPPER.writeValueAsString(mdStoreCleanedVersion),
|
||||
"-transformationPlugin", "XSLT_TRANSFORM",
|
||||
"-isLookupUrl", "https://dev-openaire.d4science.org/is/services/isLookUp",
|
||||
"-transformationRuleId",
|
||||
"183dde52-a69b-4db9-a07e-1ef2be105294_VHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZXMvVHJhbnNmb3JtYXRpb25SdWxlRFNSZXNvdXJjZVR5cGU="
|
||||
});
|
||||
|
||||
}
|
||||
|
||||
protected void verify(MDStoreVersion mdStoreVersion) throws IOException {
|
||||
Assertions.assertTrue(new File(mdStoreVersion.getHdfsPath()).exists());
|
||||
|
||||
final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext());
|
||||
long seqFileSize = sc
|
||||
.sequenceFile(mdStoreVersion.getHdfsPath() + "/sequence_file", IntWritable.class, Text.class)
|
||||
.count();
|
||||
|
||||
final Dataset<MetadataRecord> mdstore = spark.read().load(mdStoreVersion.getHdfsPath() + "/store").as(encoder);
|
||||
long mdStoreSize = mdstore.count();
|
||||
|
||||
long declaredSize = Long.parseLong(IOUtils.toString(new FileReader(mdStoreVersion.getHdfsPath() + "/size")));
|
||||
|
||||
Assertions.assertEquals(seqFileSize, declaredSize, "the size must be equal");
|
||||
Assertions.assertEquals(seqFileSize, mdStoreSize, "the size must be equal");
|
||||
|
||||
long uniqueIds = mdstore
|
||||
.map((MapFunction<MetadataRecord, String>) MetadataRecord::getId, Encoders.STRING())
|
||||
.distinct()
|
||||
.count();
|
||||
|
||||
Assertions.assertEquals(seqFileSize, uniqueIds, "the size must be equal");
|
||||
}
|
||||
|
||||
private MDStoreVersion prepareVersion(String filename) throws IOException {
|
||||
MDStoreVersion mdstore = OBJECT_MAPPER
|
||||
.readValue(IOUtils.toString(getClass().getResource(filename)), MDStoreVersion.class);
|
||||
mdstore.setHdfsPath(String.format(mdstore.getHdfsPath(), workingDir.toString()));
|
||||
return mdstore;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,9 @@
|
||||
{
|
||||
"id":"md-cleaned",
|
||||
"mdstore":"md-cleaned",
|
||||
"writing":false,
|
||||
"readCount":1,
|
||||
"lastUpdate":1612187563099,
|
||||
"size":71,
|
||||
"hdfsPath":"%s/mdstore/md-cleaned"
|
||||
}
|
@ -0,0 +1,9 @@
|
||||
{
|
||||
"id":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42-1612187678801",
|
||||
"mdstore":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42",
|
||||
"writing":true,
|
||||
"readCount":0,
|
||||
"lastUpdate":null,
|
||||
"size":0,
|
||||
"hdfsPath":"%s/mdstore/md-84e86d00-5771-4ed9-b17f-177ef4b46e42/v1"
|
||||
}
|
@ -0,0 +1,9 @@
|
||||
{
|
||||
"id":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42-1612187459108",
|
||||
"mdstore":"md-84e86d00-5771-4ed9-b17f-177ef4b46e42",
|
||||
"writing":false,
|
||||
"readCount":1,
|
||||
"lastUpdate":1612187563099,
|
||||
"size":71,
|
||||
"hdfsPath":"%s/mdstore/md-84e86d00-5771-4ed9-b17f-177ef4b46e42/v2"
|
||||
}
|
@ -0,0 +1,5 @@
|
||||
{
|
||||
"datasourceId":"74912366-d6df-49c1-a1fd-8a52fa98ce5f_UmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZXMvUmVwb3NpdG9yeVNlcnZpY2VSZXNvdXJjZVR5cGU\u003d",
|
||||
"datasourceName":"PSNC Institutional Repository",
|
||||
"nsPrefix":"psnc______pl"
|
||||
}
|
Binary file not shown.
Loading…
Reference in New Issue