2020-03-27 13:48:44 +01:00
|
|
|
package eu.dnetlib.dhp.sx.graph;
|
2020-02-19 10:07:08 +01:00
|
|
|
|
2020-03-27 10:42:17 +01:00
|
|
|
import com.mongodb.DBObject;
|
|
|
|
import com.mongodb.MongoClient;
|
|
|
|
import com.mongodb.QueryBuilder;
|
2020-02-19 10:07:08 +01:00
|
|
|
import com.mongodb.client.FindIterable;
|
|
|
|
import com.mongodb.client.MongoCollection;
|
|
|
|
import com.mongodb.client.MongoDatabase;
|
|
|
|
import eu.dnetlib.dhp.application.ArgumentApplicationParser;
|
2020-04-18 12:42:58 +02:00
|
|
|
import java.io.IOException;
|
|
|
|
import java.net.URI;
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.Objects;
|
|
|
|
import java.util.concurrent.atomic.AtomicInteger;
|
|
|
|
import java.util.function.Consumer;
|
|
|
|
import java.util.stream.Collectors;
|
2020-02-19 10:07:08 +01:00
|
|
|
import org.apache.commons.io.IOUtils;
|
|
|
|
import org.apache.hadoop.conf.Configuration;
|
|
|
|
import org.apache.hadoop.fs.FileSystem;
|
|
|
|
import org.apache.hadoop.fs.Path;
|
|
|
|
import org.apache.hadoop.io.IntWritable;
|
|
|
|
import org.apache.hadoop.io.SequenceFile;
|
|
|
|
import org.apache.hadoop.io.Text;
|
|
|
|
import org.bson.Document;
|
|
|
|
import org.bson.conversions.Bson;
|
|
|
|
|
2020-03-27 13:16:24 +01:00
|
|
|
/**
|
2020-04-18 12:42:58 +02:00
|
|
|
* This job is responsible to collect data from mongoDatabase and store in a sequence File on HDFS
|
|
|
|
* Mongo database contains information of each MDSTore in two collections: -metadata That contains
|
|
|
|
* info like: ID, format, layout, interpretation -metadataManager: that contains info : ID,
|
|
|
|
* mongoCollectionName from the metadata collection we filter the ids with Format, layout, and
|
|
|
|
* Interpretation from the metadataManager we get the current MONGO collection name which contains
|
|
|
|
* metadata XML see function getCurrentId
|
2020-03-27 13:16:24 +01:00
|
|
|
*
|
2020-04-18 12:42:58 +02:00
|
|
|
* <p>This Job will be called different times in base at the triple we want import, and generates
|
|
|
|
* for each triple a sequence file of XML
|
2020-03-27 13:16:24 +01:00
|
|
|
*/
|
|
|
|
public class ImportDataFromMongo {
|
2020-04-27 14:45:40 +02:00
|
|
|
/**
|
|
|
|
* It requires in input some parameters described on a file
|
|
|
|
* eu/dnetlib/dhp/graph/sx/import_from_mongo_parameters.json
|
|
|
|
*
|
|
|
|
* <p>- the name node - the paht where store HDFS File - the mongo host - the mongo port - the
|
|
|
|
* metadata format to import - the metadata layout to import - the metadata interpretation to
|
|
|
|
* import - the mongo database Name
|
|
|
|
*
|
|
|
|
* <p>This params are encoded into args
|
|
|
|
*
|
|
|
|
* @param args
|
|
|
|
* @throws Exception
|
|
|
|
*/
|
|
|
|
public static void main(String[] args) throws Exception {
|
|
|
|
final ArgumentApplicationParser parser =
|
|
|
|
new ArgumentApplicationParser(
|
|
|
|
IOUtils.toString(
|
|
|
|
ImportDataFromMongo.class.getResourceAsStream(
|
|
|
|
"/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json")));
|
|
|
|
parser.parseArgument(args);
|
|
|
|
final int port = Integer.parseInt(parser.get("dbport"));
|
|
|
|
final String host = parser.get("dbhost");
|
2020-02-19 10:07:08 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
final String format = parser.get("format");
|
|
|
|
final String layout = parser.get("layout");
|
|
|
|
final String interpretation = parser.get("interpretation");
|
2020-02-19 10:07:08 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
final String dbName = parser.get("dbName");
|
|
|
|
final MongoClient client = new MongoClient(host, port);
|
|
|
|
MongoDatabase database = client.getDatabase(dbName);
|
2020-02-19 10:07:08 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
MongoCollection<Document> metadata = database.getCollection("metadata");
|
|
|
|
MongoCollection<Document> metadataManager = database.getCollection("metadataManager");
|
|
|
|
final DBObject query =
|
|
|
|
QueryBuilder.start("format")
|
|
|
|
.is(format)
|
|
|
|
.and("layout")
|
|
|
|
.is(layout)
|
|
|
|
.and("interpretation")
|
|
|
|
.is(interpretation)
|
|
|
|
.get();
|
|
|
|
final List<String> ids = new ArrayList<>();
|
|
|
|
metadata
|
|
|
|
.find((Bson) query)
|
|
|
|
.forEach((Consumer<Document>) document -> ids.add(document.getString("mdId")));
|
|
|
|
List<String> databaseId =
|
|
|
|
ids.stream()
|
|
|
|
.map(it -> getCurrentId(it, metadataManager))
|
|
|
|
.filter(Objects::nonNull)
|
|
|
|
.collect(Collectors.toList());
|
2020-03-27 13:16:24 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
final String hdfsuri = parser.get("namenode");
|
|
|
|
// ====== Init HDFS File System Object
|
|
|
|
Configuration conf = new Configuration();
|
|
|
|
// Set FileSystem URI
|
|
|
|
conf.set("fs.defaultFS", hdfsuri);
|
|
|
|
// Because of Maven
|
|
|
|
conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
|
|
|
|
conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
|
2020-02-19 10:07:08 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
FileSystem.get(URI.create(hdfsuri), conf);
|
|
|
|
Path hdfswritepath = new Path(parser.get("targetPath"));
|
2020-02-19 10:07:08 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
final AtomicInteger counter = new AtomicInteger(0);
|
|
|
|
try (SequenceFile.Writer writer =
|
|
|
|
SequenceFile.createWriter(
|
|
|
|
conf,
|
|
|
|
SequenceFile.Writer.file(hdfswritepath),
|
|
|
|
SequenceFile.Writer.keyClass(IntWritable.class),
|
|
|
|
SequenceFile.Writer.valueClass(Text.class))) {
|
|
|
|
final IntWritable key = new IntWritable(counter.get());
|
|
|
|
final Text value = new Text();
|
|
|
|
databaseId.forEach(
|
|
|
|
id -> {
|
|
|
|
System.out.println("Reading :" + id);
|
|
|
|
MongoCollection<Document> collection = database.getCollection(id);
|
|
|
|
collection
|
|
|
|
.find()
|
|
|
|
.forEach(
|
|
|
|
(Consumer<Document>)
|
|
|
|
document -> {
|
|
|
|
key.set(counter.getAndIncrement());
|
|
|
|
value.set(document.getString("body"));
|
2020-02-19 10:07:08 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
if (counter.get() % 10000 == 0) {
|
|
|
|
System.out.println("Added " + counter.get());
|
|
|
|
}
|
|
|
|
try {
|
|
|
|
writer.append(key, value);
|
|
|
|
} catch (IOException e) {
|
|
|
|
throw new RuntimeException(e);
|
|
|
|
}
|
|
|
|
});
|
|
|
|
});
|
2020-02-19 10:07:08 +01:00
|
|
|
}
|
2020-04-27 14:45:40 +02:00
|
|
|
}
|
2020-02-19 10:07:08 +01:00
|
|
|
|
2020-04-27 14:45:40 +02:00
|
|
|
/**
|
|
|
|
* Return the name of mongo collection giving an MdStore ID
|
|
|
|
*
|
|
|
|
* @param mdId The id of the MDStore
|
|
|
|
* @param metadataManager The collection metadataManager on mongo which contains this information
|
|
|
|
* @return
|
|
|
|
*/
|
|
|
|
private static String getCurrentId(
|
|
|
|
final String mdId, final MongoCollection<Document> metadataManager) {
|
|
|
|
FindIterable<Document> result =
|
|
|
|
metadataManager.find((Bson) QueryBuilder.start("mdId").is(mdId).get());
|
|
|
|
final Document item = result.first();
|
|
|
|
return item == null ? null : item.getString("currentId");
|
|
|
|
}
|
2020-02-19 10:07:08 +01:00
|
|
|
}
|