forked from D-Net/dnet-hadoop
Merge pull request '8232-mdstore-synch-improve' (#272) from 8232-mdstore-synch-improve into beta
Reviewed-on: D-Net/dnet-hadoop#272
This commit is contained in:
commit
a590c371a9
|
@ -0,0 +1,100 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This utility represent the Metadata Store information
|
||||||
|
* needed during the migration from mongo to HDFS to store
|
||||||
|
*/
|
||||||
|
public class MDStoreInfo {
|
||||||
|
private String mdstore;
|
||||||
|
private String currentId;
|
||||||
|
private Long latestTimestamp;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instantiates a new Md store info.
|
||||||
|
*/
|
||||||
|
public MDStoreInfo() {
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Instantiates a new Md store info.
|
||||||
|
*
|
||||||
|
* @param mdstore the mdstore
|
||||||
|
* @param currentId the current id
|
||||||
|
* @param latestTimestamp the latest timestamp
|
||||||
|
*/
|
||||||
|
public MDStoreInfo(String mdstore, String currentId, Long latestTimestamp) {
|
||||||
|
this.mdstore = mdstore;
|
||||||
|
this.currentId = currentId;
|
||||||
|
this.latestTimestamp = latestTimestamp;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets mdstore.
|
||||||
|
*
|
||||||
|
* @return the mdstore
|
||||||
|
*/
|
||||||
|
public String getMdstore() {
|
||||||
|
return mdstore;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets mdstore.
|
||||||
|
*
|
||||||
|
* @param mdstore the mdstore
|
||||||
|
* @return the mdstore
|
||||||
|
*/
|
||||||
|
public MDStoreInfo setMdstore(String mdstore) {
|
||||||
|
this.mdstore = mdstore;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets current id.
|
||||||
|
*
|
||||||
|
* @return the current id
|
||||||
|
*/
|
||||||
|
public String getCurrentId() {
|
||||||
|
return currentId;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets current id.
|
||||||
|
*
|
||||||
|
* @param currentId the current id
|
||||||
|
* @return the current id
|
||||||
|
*/
|
||||||
|
public MDStoreInfo setCurrentId(String currentId) {
|
||||||
|
this.currentId = currentId;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Gets latest timestamp.
|
||||||
|
*
|
||||||
|
* @return the latest timestamp
|
||||||
|
*/
|
||||||
|
public Long getLatestTimestamp() {
|
||||||
|
return latestTimestamp;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets latest timestamp.
|
||||||
|
*
|
||||||
|
* @param latestTimestamp the latest timestamp
|
||||||
|
* @return the latest timestamp
|
||||||
|
*/
|
||||||
|
public MDStoreInfo setLatestTimestamp(Long latestTimestamp) {
|
||||||
|
this.latestTimestamp = latestTimestamp;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public String toString() {
|
||||||
|
return "MDStoreInfo{" +
|
||||||
|
"mdstore='" + mdstore + '\'' +
|
||||||
|
", currentId='" + currentId + '\'' +
|
||||||
|
", latestTimestamp=" + latestTimestamp +
|
||||||
|
'}';
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,12 +1,12 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.common;
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
|
import static com.mongodb.client.model.Sorts.descending;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.*;
|
||||||
import java.util.HashMap;
|
import java.util.stream.Collectors;
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Optional;
|
|
||||||
import java.util.stream.StreamSupport;
|
import java.util.stream.StreamSupport;
|
||||||
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
import org.apache.commons.lang3.StringUtils;
|
||||||
|
@ -38,6 +38,26 @@ public class MdstoreClient implements Closeable {
|
||||||
this.db = getDb(client, dbName);
|
this.db = getDb(client, dbName);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private Long parseTimestamp(Document f) {
|
||||||
|
if (f == null || !f.containsKey("timestamp"))
|
||||||
|
return null;
|
||||||
|
|
||||||
|
Object ts = f.get("timestamp");
|
||||||
|
|
||||||
|
return Long.parseLong(ts.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
public Long getLatestTimestamp(final String collectionId) {
|
||||||
|
MongoCollection<Document> collection = db.getCollection(collectionId);
|
||||||
|
FindIterable<Document> result = collection.find().sort(descending("timestamp")).limit(1);
|
||||||
|
if (result == null) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
Document f = result.first();
|
||||||
|
return parseTimestamp(f);
|
||||||
|
}
|
||||||
|
|
||||||
public MongoCollection<Document> mdStore(final String mdId) {
|
public MongoCollection<Document> mdStore(final String mdId) {
|
||||||
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
|
BasicDBObject query = (BasicDBObject) QueryBuilder.start("mdId").is(mdId).get();
|
||||||
|
|
||||||
|
@ -54,6 +74,16 @@ public class MdstoreClient implements Closeable {
|
||||||
return getColl(db, currentId, true);
|
return getColl(db, currentId, true);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public List<MDStoreInfo> mdStoreWithTimestamp(final String mdFormat, final String mdLayout,
|
||||||
|
final String mdInterpretation) {
|
||||||
|
Map<String, String> res = validCollections(mdFormat, mdLayout, mdInterpretation);
|
||||||
|
return res
|
||||||
|
.entrySet()
|
||||||
|
.stream()
|
||||||
|
.map(e -> new MDStoreInfo(e.getKey(), e.getValue(), getLatestTimestamp(e.getValue())))
|
||||||
|
.collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
public Map<String, String> validCollections(
|
public Map<String, String> validCollections(
|
||||||
final String mdFormat, final String mdLayout, final String mdInterpretation) {
|
final String mdFormat, final String mdLayout, final String mdInterpretation) {
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
|
||||||
|
package eu.dnetlib.dhp.common;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.Paths;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.junit.jupiter.api.Test;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
|
public class MdStoreClientTest {
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testMongoCollection() throws IOException {
|
||||||
|
final MdstoreClient client = new MdstoreClient("mongodb://localhost:27017", "mdstore");
|
||||||
|
|
||||||
|
final ObjectMapper mapper = new ObjectMapper();
|
||||||
|
|
||||||
|
final List<MDStoreInfo> infos = client.mdStoreWithTimestamp("ODF", "store", "cleaned");
|
||||||
|
|
||||||
|
infos.forEach(System.out::println);
|
||||||
|
|
||||||
|
final String s = mapper.writeValueAsString(infos);
|
||||||
|
|
||||||
|
Path fileName = Paths.get("/Users/sandro/mdstore_info.json");
|
||||||
|
|
||||||
|
// Writing into the file
|
||||||
|
Files.write(fileName, s.getBytes(StandardCharsets.UTF_8));
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -1,31 +1,87 @@
|
||||||
|
|
||||||
package eu.dnetlib.dhp.oa.graph.raw;
|
package eu.dnetlib.dhp.oa.graph.raw;
|
||||||
|
|
||||||
|
import static eu.dnetlib.dhp.utils.DHPUtils.getHadoopConfiguration;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.util.HashMap;
|
||||||
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Map.Entry;
|
import java.util.Objects;
|
||||||
|
import java.util.function.Consumer;
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
|
import org.apache.hadoop.fs.LocatedFileStatus;
|
||||||
|
import org.apache.hadoop.fs.Path;
|
||||||
|
import org.apache.hadoop.fs.RemoteIterator;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
||||||
|
import eu.dnetlib.dhp.common.MDStoreInfo;
|
||||||
import eu.dnetlib.dhp.common.MdstoreClient;
|
import eu.dnetlib.dhp.common.MdstoreClient;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
|
import eu.dnetlib.dhp.oa.graph.raw.common.AbstractMigrationApplication;
|
||||||
|
|
||||||
public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
|
public class MigrateMongoMdstoresApplication extends AbstractMigrationApplication implements Closeable {
|
||||||
|
|
||||||
private static final Logger log = LoggerFactory.getLogger(MigrateMongoMdstoresApplication.class);
|
private static final Logger log = LoggerFactory.getLogger(MigrateMongoMdstoresApplication.class);
|
||||||
|
|
||||||
private final MdstoreClient mdstoreClient;
|
private final MdstoreClient mdstoreClient;
|
||||||
|
|
||||||
|
private static List<MDStoreInfo> snapshotsMDStores(final MdstoreClient client,
|
||||||
|
final String format,
|
||||||
|
final String layout,
|
||||||
|
final String interpretation) {
|
||||||
|
return client.mdStoreWithTimestamp(format, layout, interpretation);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static MDStoreInfo extractPath(final String path, final String basePath) {
|
||||||
|
int res = path.indexOf(basePath);
|
||||||
|
if (res > 0) {
|
||||||
|
String[] split = path.substring(res).split("/");
|
||||||
|
if (split.length > 2) {
|
||||||
|
final String ts = split[split.length - 1];
|
||||||
|
final String mdStore = split[split.length - 2];
|
||||||
|
return new MDStoreInfo(mdStore, null, Long.parseLong(ts));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Map<String, MDStoreInfo> hdfsMDStoreInfo(FileSystem fs, final String basePath) throws IOException {
|
||||||
|
final Map<String, MDStoreInfo> hdfs_store = new HashMap<>();
|
||||||
|
final Path p = new Path(basePath);
|
||||||
|
final RemoteIterator<LocatedFileStatus> ls = fs.listFiles(p, true);
|
||||||
|
while (ls.hasNext()) {
|
||||||
|
|
||||||
|
String current = ls.next().getPath().toString();
|
||||||
|
|
||||||
|
final MDStoreInfo info = extractPath(current, basePath);
|
||||||
|
if (info != null) {
|
||||||
|
hdfs_store.put(info.getMdstore(), info);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return hdfs_store;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String createMDStoreDir(final String basePath, final String mdStoreId) {
|
||||||
|
if (basePath.endsWith("/")) {
|
||||||
|
return basePath + mdStoreId;
|
||||||
|
} else {
|
||||||
|
return String.format("%s/%s", basePath, mdStoreId);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
public static void main(final String[] args) throws Exception {
|
public static void main(final String[] args) throws Exception {
|
||||||
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
|
||||||
IOUtils
|
IOUtils
|
||||||
.toString(
|
.toString(
|
||||||
|
Objects
|
||||||
|
.requireNonNull(
|
||||||
MigrateMongoMdstoresApplication.class
|
MigrateMongoMdstoresApplication.class
|
||||||
.getResourceAsStream("/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json")));
|
.getResourceAsStream(
|
||||||
|
"/eu/dnetlib/dhp/oa/graph/migrate_mongo_mstores_parameters.json"))));
|
||||||
parser.parseArgument(args);
|
parser.parseArgument(args);
|
||||||
|
|
||||||
final String mongoBaseUrl = parser.get("mongoBaseUrl");
|
final String mongoBaseUrl = parser.get("mongoBaseUrl");
|
||||||
|
@ -36,32 +92,120 @@ public class MigrateMongoMdstoresApplication extends AbstractMigrationApplicatio
|
||||||
final String mdInterpretation = parser.get("mdInterpretation");
|
final String mdInterpretation = parser.get("mdInterpretation");
|
||||||
|
|
||||||
final String hdfsPath = parser.get("hdfsPath");
|
final String hdfsPath = parser.get("hdfsPath");
|
||||||
|
final String nameNode = parser.get("nameNode");
|
||||||
|
|
||||||
|
final FileSystem fileSystem = FileSystem.get(getHadoopConfiguration(nameNode));
|
||||||
|
|
||||||
|
final MdstoreClient mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
|
||||||
|
|
||||||
|
final List<MDStoreInfo> mongoMDStores = snapshotsMDStores(mdstoreClient, mdFormat, mdLayout, mdInterpretation);
|
||||||
|
|
||||||
|
final Map<String, MDStoreInfo> hdfsMDStores = hdfsMDStoreInfo(fileSystem, hdfsPath);
|
||||||
|
|
||||||
|
mongoMDStores
|
||||||
|
.stream()
|
||||||
|
.filter(currentMDStore -> currentMDStore.getLatestTimestamp() != null)
|
||||||
|
.forEach(
|
||||||
|
consumeMDStore(
|
||||||
|
mdFormat, mdLayout, mdInterpretation, hdfsPath, fileSystem, mongoBaseUrl, mongoDb, hdfsMDStores));
|
||||||
|
|
||||||
|
// TODO: DELETE MDStORE FOLDER NOT PRESENT IN MONGO
|
||||||
|
|
||||||
try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(hdfsPath, mongoBaseUrl,
|
|
||||||
mongoDb)) {
|
|
||||||
app.execute(mdFormat, mdLayout, mdInterpretation);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public MigrateMongoMdstoresApplication(
|
/**
|
||||||
final String hdfsPath, final String mongoBaseUrl, final String mongoDb) throws Exception {
|
* This method is responsible to sync only the stores that have been changed since last time
|
||||||
|
* @param mdFormat the MDStore's format
|
||||||
|
* @param mdLayout the MDStore'slayout
|
||||||
|
* @param mdInterpretation the MDStore's interpretation
|
||||||
|
* @param hdfsPath the basePath into hdfs where all MD-stores are stored
|
||||||
|
* @param fileSystem The Hadoop File system client
|
||||||
|
* @param hdfsMDStores A Map containing as Key the mdstore ID and as value the @{@link MDStoreInfo}
|
||||||
|
* @return
|
||||||
|
*/
|
||||||
|
private static Consumer<MDStoreInfo> consumeMDStore(String mdFormat, String mdLayout, String mdInterpretation,
|
||||||
|
String hdfsPath, FileSystem fileSystem, final String mongoBaseUrl, final String mongoDb,
|
||||||
|
Map<String, MDStoreInfo> hdfsMDStores) {
|
||||||
|
return currentMDStore -> {
|
||||||
|
// If the key is missing it means that the mdstore is not present in hdfs
|
||||||
|
// that is the hdfs path basePath/MDSTOREID/timestamp is missing
|
||||||
|
// So we have to synch it
|
||||||
|
if (!hdfsMDStores.containsKey(currentMDStore.getMdstore())) {
|
||||||
|
log.info("Adding store " + currentMDStore.getMdstore());
|
||||||
|
try {
|
||||||
|
synchMDStoreIntoHDFS(
|
||||||
|
mdFormat, mdLayout, mdInterpretation, hdfsPath, fileSystem, mongoBaseUrl, mongoDb,
|
||||||
|
currentMDStore);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException(e);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
final MDStoreInfo current = hdfsMDStores.get(currentMDStore.getMdstore());
|
||||||
|
// IF the key is present it means that in hdfs we have a path
|
||||||
|
// basePath/MDSTOREID/timestamp but the timestamp on hdfs is older that the
|
||||||
|
// new one in mongo so we have to synch the new mdstore and delete the old one
|
||||||
|
if (currentMDStore.getLatestTimestamp() > current.getLatestTimestamp()) {
|
||||||
|
log.info("Updating MDStore " + currentMDStore.getMdstore());
|
||||||
|
final String mdstoreDir = createMDStoreDir(hdfsPath, currentMDStore.getMdstore());
|
||||||
|
final String rmPath = createMDStoreDir(mdstoreDir, current.getLatestTimestamp().toString());
|
||||||
|
try {
|
||||||
|
synchMDStoreIntoHDFS(
|
||||||
|
mdFormat, mdLayout, mdInterpretation, hdfsPath, fileSystem, mongoBaseUrl, mongoDb,
|
||||||
|
currentMDStore);
|
||||||
|
log.info("deleting " + rmPath);
|
||||||
|
// DELETE THE OLD MDSTORE
|
||||||
|
fileSystem.delete(new Path(rmPath), true);
|
||||||
|
} catch (IOException e) {
|
||||||
|
throw new RuntimeException("Unable to synch and remove path " + rmPath, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*This method store into hdfs all the MONGO record of a single mdstore into the HDFS File
|
||||||
|
*
|
||||||
|
* @param mdFormat the MDStore's format
|
||||||
|
* @param mdLayout the MDStore'slayout
|
||||||
|
* @param mdInterpretation the MDStore's interpretation
|
||||||
|
* @param hdfsPath the basePath into hdfs where all MD-stores are stored
|
||||||
|
* @param fileSystem The Hadoop File system client
|
||||||
|
* @param currentMDStore The current Mongo MDStore ID
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
private static void synchMDStoreIntoHDFS(String mdFormat, String mdLayout, String mdInterpretation, String hdfsPath,
|
||||||
|
FileSystem fileSystem, final String mongoBaseUrl, final String mongoDb, MDStoreInfo currentMDStore)
|
||||||
|
throws IOException {
|
||||||
|
// FIRST CREATE the directory basePath/MDSTOREID
|
||||||
|
final String mdstoreDir = createMDStoreDir(hdfsPath, currentMDStore.getMdstore());
|
||||||
|
fileSystem.mkdirs(new Path(mdstoreDir));
|
||||||
|
// Then synch all the records into basePath/MDSTOREID/timestamp
|
||||||
|
final String currentIdDir = createMDStoreDir(mdstoreDir, currentMDStore.getLatestTimestamp().toString());
|
||||||
|
try (MigrateMongoMdstoresApplication app = new MigrateMongoMdstoresApplication(mongoBaseUrl, mongoDb,
|
||||||
|
currentIdDir)) {
|
||||||
|
app.execute(currentMDStore.getCurrentId(), mdFormat, mdLayout, mdInterpretation);
|
||||||
|
} catch (Exception e) {
|
||||||
|
throw new RuntimeException(
|
||||||
|
String
|
||||||
|
.format("Error on sync mdstore with ID %s into path %s", currentMDStore.getMdstore(), currentIdDir),
|
||||||
|
e);
|
||||||
|
}
|
||||||
|
log.info(String.format("Synchronized mdStore id : %s into path %s", currentMDStore.getMdstore(), currentIdDir));
|
||||||
|
}
|
||||||
|
|
||||||
|
public MigrateMongoMdstoresApplication(final String mongoBaseUrl, final String mongoDb, final String hdfsPath)
|
||||||
|
throws Exception {
|
||||||
super(hdfsPath);
|
super(hdfsPath);
|
||||||
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
|
this.mdstoreClient = new MdstoreClient(mongoBaseUrl, mongoDb);
|
||||||
}
|
}
|
||||||
|
|
||||||
public void execute(final String format, final String layout, final String interpretation) {
|
public void execute(final String currentColl, final String format, final String layout,
|
||||||
final Map<String, String> colls = mdstoreClient.validCollections(format, layout, interpretation);
|
final String interpretation) {
|
||||||
log.info("Found {} mdstores", colls.size());
|
|
||||||
|
|
||||||
for (final Entry<String, String> entry : colls.entrySet()) {
|
|
||||||
log.info("Processing mdstore {} (collection: {})", entry.getKey(), entry.getValue());
|
|
||||||
final String currentColl = entry.getValue();
|
|
||||||
|
|
||||||
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
for (final String xml : mdstoreClient.listRecords(currentColl)) {
|
||||||
emit(xml, String.format("%s-%s-%s", format, layout, interpretation));
|
emit(xml, String.format("%s-%s-%s", format, layout, interpretation));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void close() throws IOException {
|
public void close() throws IOException {
|
||||||
|
|
|
@ -3,33 +3,20 @@ package eu.dnetlib.dhp.oa.graph.raw.common;
|
||||||
|
|
||||||
import java.io.Closeable;
|
import java.io.Closeable;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.Arrays;
|
|
||||||
import java.util.List;
|
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.concurrent.atomic.AtomicInteger;
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.stream.Collectors;
|
|
||||||
|
|
||||||
import org.apache.commons.io.IOUtils;
|
|
||||||
import org.apache.commons.lang3.StringUtils;
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.fs.Path;
|
import org.apache.hadoop.fs.Path;
|
||||||
import org.apache.hadoop.io.SequenceFile;
|
import org.apache.hadoop.io.SequenceFile;
|
||||||
import org.apache.hadoop.io.Text;
|
import org.apache.hadoop.io.Text;
|
||||||
import org.apache.http.client.methods.CloseableHttpResponse;
|
|
||||||
import org.apache.http.client.methods.HttpGet;
|
|
||||||
import org.apache.http.impl.client.CloseableHttpClient;
|
|
||||||
import org.apache.http.impl.client.HttpClients;
|
|
||||||
|
|
||||||
import com.fasterxml.jackson.core.JsonProcessingException;
|
import com.fasterxml.jackson.core.JsonProcessingException;
|
||||||
import com.fasterxml.jackson.databind.ObjectMapper;
|
import com.fasterxml.jackson.databind.ObjectMapper;
|
||||||
|
|
||||||
import eu.dnetlib.dhp.common.vocabulary.VocabularyGroup;
|
import eu.dnetlib.dhp.schema.oaf.Oaf;
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.OafToOafMapper;
|
|
||||||
import eu.dnetlib.dhp.oa.graph.raw.OdfToOafMapper;
|
|
||||||
import eu.dnetlib.dhp.schema.mdstore.MDStoreWithInfo;
|
|
||||||
import eu.dnetlib.dhp.schema.oaf.*;
|
|
||||||
import eu.dnetlib.dhp.utils.DHPUtils;
|
import eu.dnetlib.dhp.utils.DHPUtils;
|
||||||
|
|
||||||
public class AbstractMigrationApplication implements Closeable {
|
public class AbstractMigrationApplication implements Closeable {
|
||||||
|
|
|
@ -5,6 +5,12 @@
|
||||||
"paramDescription": "the path where storing the sequential file",
|
"paramDescription": "the path where storing the sequential file",
|
||||||
"paramRequired": true
|
"paramRequired": true
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
"paramName": "n",
|
||||||
|
"paramLongName": "nameNode",
|
||||||
|
"paramDescription": "the hdfs Name node url",
|
||||||
|
"paramRequired": true
|
||||||
|
},
|
||||||
{
|
{
|
||||||
"paramName": "mongourl",
|
"paramName": "mongourl",
|
||||||
"paramLongName": "mongoBaseUrl",
|
"paramLongName": "mongoBaseUrl",
|
||||||
|
|
|
@ -301,6 +301,7 @@
|
||||||
<arg>--mdFormat</arg><arg>ODF</arg>
|
<arg>--mdFormat</arg><arg>ODF</arg>
|
||||||
<arg>--mdLayout</arg><arg>store</arg>
|
<arg>--mdLayout</arg><arg>store</arg>
|
||||||
<arg>--mdInterpretation</arg><arg>cleaned</arg>
|
<arg>--mdInterpretation</arg><arg>cleaned</arg>
|
||||||
|
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="reuse_oaf"/>
|
<ok to="reuse_oaf"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -326,6 +327,7 @@
|
||||||
<arg>--mdFormat</arg><arg>OAF</arg>
|
<arg>--mdFormat</arg><arg>OAF</arg>
|
||||||
<arg>--mdLayout</arg><arg>store</arg>
|
<arg>--mdLayout</arg><arg>store</arg>
|
||||||
<arg>--mdInterpretation</arg><arg>cleaned</arg>
|
<arg>--mdInterpretation</arg><arg>cleaned</arg>
|
||||||
|
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="ImportOAF_invisible"/>
|
<ok to="ImportOAF_invisible"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -343,6 +345,7 @@
|
||||||
<arg>--mdFormat</arg><arg>OAF</arg>
|
<arg>--mdFormat</arg><arg>OAF</arg>
|
||||||
<arg>--mdLayout</arg><arg>store</arg>
|
<arg>--mdLayout</arg><arg>store</arg>
|
||||||
<arg>--mdInterpretation</arg><arg>intersection</arg>
|
<arg>--mdInterpretation</arg><arg>intersection</arg>
|
||||||
|
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="ImportODF_hdfs"/>
|
<ok to="ImportODF_hdfs"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
|
@ -4,18 +4,6 @@
|
||||||
<name>migrationPathStep1</name>
|
<name>migrationPathStep1</name>
|
||||||
<description>the base path to store hdfs file</description>
|
<description>the base path to store hdfs file</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>postgresURL</name>
|
|
||||||
<description>the postgres URL to access to the database</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>postgresUser</name>
|
|
||||||
<description>the user postgres</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>postgresPassword</name>
|
|
||||||
<description>the password postgres</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
<property>
|
||||||
<name>mongoURL</name>
|
<name>mongoURL</name>
|
||||||
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
<description>mongoDB url, example: mongodb://[username:password@]host[:port]</description>
|
||||||
|
@ -24,27 +12,6 @@
|
||||||
<name>mongoDb</name>
|
<name>mongoDb</name>
|
||||||
<description>mongo database</description>
|
<description>mongo database</description>
|
||||||
</property>
|
</property>
|
||||||
<property>
|
|
||||||
<name>isLookupUrl</name>
|
|
||||||
<description>the address of the lookUp service</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>nsPrefixBlacklist</name>
|
|
||||||
<value></value>
|
|
||||||
<description>a blacklist of nsprefixes (comma separeted)</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkDriverMemory</name>
|
|
||||||
<description>memory for driver process</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkExecutorMemory</name>
|
|
||||||
<description>memory for individual executor</description>
|
|
||||||
</property>
|
|
||||||
<property>
|
|
||||||
<name>sparkExecutorCores</name>
|
|
||||||
<description>number of cores used by single executor</description>
|
|
||||||
</property>
|
|
||||||
</parameters>
|
</parameters>
|
||||||
|
|
||||||
<start to="ResetWorkingPath"/>
|
<start to="ResetWorkingPath"/>
|
||||||
|
@ -58,37 +25,22 @@
|
||||||
<delete path='${migrationPathStep1}'/>
|
<delete path='${migrationPathStep1}'/>
|
||||||
<mkdir path='${migrationPathStep1}'/>
|
<mkdir path='${migrationPathStep1}'/>
|
||||||
</fs>
|
</fs>
|
||||||
<ok to="ImportDB"/>
|
|
||||||
<error to="Kill"/>
|
|
||||||
</action>
|
|
||||||
|
|
||||||
<action name="ImportDB">
|
|
||||||
<java>
|
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
|
||||||
<name-node>${nameNode}</name-node>
|
|
||||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateDbEntitiesApplication</main-class>
|
|
||||||
<arg>-p</arg><arg>${migrationPathStep1}/db_records</arg>
|
|
||||||
<arg>-pgurl</arg><arg>${postgresURL}</arg>
|
|
||||||
<arg>-pguser</arg><arg>${postgresUser}</arg>
|
|
||||||
<arg>-pgpasswd</arg><arg>${postgresPassword}</arg>
|
|
||||||
<arg>-islookup</arg><arg>${isLookupUrl}</arg>
|
|
||||||
<arg>--nsPrefixBlacklist</arg><arg>${nsPrefixBlacklist}</arg>
|
|
||||||
</java>
|
|
||||||
<ok to="ImportODF"/>
|
<ok to="ImportODF"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<action name="ImportODF">
|
<action name="ImportODF">
|
||||||
<java>
|
<java>
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
<name-node>${nameNode}</name-node>
|
<name-node>${nameNode}</name-node>
|
||||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
|
||||||
<arg>-p</arg><arg>${migrationPathStep1}/odf_records</arg>
|
<arg>-p</arg><arg>${migrationPathStep1}</arg>
|
||||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||||
<arg>-f</arg><arg>ODF</arg>
|
<arg>-f</arg><arg>ODF</arg>
|
||||||
<arg>-l</arg><arg>store</arg>
|
<arg>-l</arg><arg>store</arg>
|
||||||
<arg>-i</arg><arg>cleaned</arg>
|
<arg>-i</arg><arg>cleaned</arg>
|
||||||
|
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||||
|
|
||||||
</java>
|
</java>
|
||||||
<ok to="ImportOAF"/>
|
<ok to="ImportOAF"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
@ -98,13 +50,14 @@
|
||||||
<java>
|
<java>
|
||||||
<job-tracker>${jobTracker}</job-tracker>
|
<job-tracker>${jobTracker}</job-tracker>
|
||||||
<name-node>${nameNode}</name-node>
|
<name-node>${nameNode}</name-node>
|
||||||
<main-class>eu.dnetlib.dhp.migration.step1.MigrateMongoMdstoresApplication</main-class>
|
<main-class>eu.dnetlib.dhp.oa.graph.raw.MigrateMongoMdstoresApplication</main-class>
|
||||||
<arg>-p</arg><arg>${migrationPathStep1}/oaf_records</arg>
|
<arg>-p</arg><arg>${migrationPathStep1}</arg>
|
||||||
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
<arg>-mongourl</arg><arg>${mongoURL}</arg>
|
||||||
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
<arg>-mongodb</arg><arg>${mongoDb}</arg>
|
||||||
<arg>-f</arg><arg>OAF</arg>
|
<arg>-f</arg><arg>OAF</arg>
|
||||||
<arg>-l</arg><arg>store</arg>
|
<arg>-l</arg><arg>store</arg>
|
||||||
<arg>-i</arg><arg>cleaned</arg>
|
<arg>-i</arg><arg>cleaned</arg>
|
||||||
|
<arg>--nameNode</arg><arg>${nameNode}</arg>
|
||||||
</java>
|
</java>
|
||||||
<ok to="End"/>
|
<ok to="End"/>
|
||||||
<error to="Kill"/>
|
<error to="Kill"/>
|
||||||
|
|
Loading…
Reference in New Issue