package eu.dnetlib.dhp.sx.graph; import java.io.IOException; import java.net.URI; import java.util.ArrayList; import java.util.List; import java.util.Objects; import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Consumer; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.bson.Document; import org.bson.conversions.Bson; import com.mongodb.DBObject; import com.mongodb.MongoClient; import com.mongodb.QueryBuilder; import com.mongodb.client.FindIterable; import com.mongodb.client.MongoCollection; import com.mongodb.client.MongoDatabase; import eu.dnetlib.dhp.application.ArgumentApplicationParser; /** * This job collects data from a Mongo database and stores it in a sequence file on HDFS. The Mongo database * contains information about each MDStore in two collections: - metadata, which contains info like ID, format, layout, * interpretation; - metadataManager, which contains ID and mongoCollectionName. From the metadata collection we filter * the ids by format, layout, and interpretation; from the metadataManager we get the current Mongo collection name * which contains the metadata XML (see function getCurrentId). *
* This job is called multiple times, once for each (format, layout, interpretation) triple we want to import, and * generates for each triple a sequence file of XML. */ public class ImportDataFromMongo { /** * It requires as input some parameters described in the resource file * eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json *
* - the name node - the path where the HDFS file is stored - the mongo host - the mongo port - the metadata format to * import - the metadata layout to import - the metadata interpretation to import - the mongo database name *
* These parameters are passed in through args
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
final ArgumentApplicationParser parser = new ArgumentApplicationParser(
IOUtils
.toString(
ImportDataFromMongo.class
.getResourceAsStream(
"/eu/dnetlib/dhp/sx/graph/argumentparser/import_from_mongo_parameters.json")));
parser.parseArgument(args);
final int port = Integer.parseInt(parser.get("dbport"));
final String host = parser.get("dbhost");
final String format = parser.get("format");
final String layout = parser.get("layout");
final String interpretation = parser.get("interpretation");
final String dbName = parser.get("dbName");
final MongoClient client = new MongoClient(host, port);
MongoDatabase database = client.getDatabase(dbName);
MongoCollection