forked from D-Net/dnet-hadoop
This commit is contained in:
parent 2740b95f99
commit c07f820c21
@@ -1,11 +1,8 @@
 
 package eu.dnetlib.doiboost.crossref;
 
-import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession;
-
 import java.io.BufferedOutputStream;
 import java.net.URI;
-import java.util.Optional;
 import java.util.zip.GZIPOutputStream;
 
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
@@ -17,13 +14,11 @@ import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FSDataOutputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.spark.SparkConf;
-import org.apache.spark.util.LongAccumulator;
 import org.mortbay.log.Log;
 
 import eu.dnetlib.dhp.application.ArgumentApplicationParser;
 
-public class SparkExtractCrossrefRecords {
+public class ExtractCrossrefRecords {
 	public static void main(String[] args) throws Exception {
 		String hdfsServerUri;
 		String workingPath;
@@ -31,58 +26,40 @@ public class SparkExtractCrossrefRecords {
 		final ArgumentApplicationParser parser = new ArgumentApplicationParser(
 			IOUtils
 				.toString(
-					SparkExtractCrossrefRecords.class
+					ExtractCrossrefRecords.class
 						.getResourceAsStream(
 							"/eu/dnetlib/dhp/doiboost/crossref_dump_reader.json")));
 		parser.parseArgument(args);
 		hdfsServerUri = parser.get("hdfsServerUri");
 		workingPath = parser.get("workingPath");
 		crossrefFileNameTarGz = parser.get("crossrefFileNameTarGz");
-		Boolean isSparkSessionManaged = Optional
-			.ofNullable(parser.get("isSparkSessionManaged"))
-			.map(Boolean::valueOf)
-			.orElse(Boolean.TRUE);
-		SparkConf sparkConf = new SparkConf();
-		runWithSparkSession(
-			sparkConf,
-			isSparkSessionManaged,
-			spark -> {
-				LongAccumulator filesCounter = spark
-					.sparkContext()
-					.longAccumulator("filesCounter");
-				Path hdfsreadpath = new Path(hdfsServerUri.concat(workingPath).concat(crossrefFileNameTarGz));
-				Configuration conf = new Configuration();
-				conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
-				conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
-				conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
-				FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
-				FSDataInputStream crossrefFileStream = fs.open(hdfsreadpath);
-				try (TarArchiveInputStream tais = new TarArchiveInputStream(
-					new GzipCompressorInputStream(crossrefFileStream))) {
-					TarArchiveEntry entry = null;
-					while ((entry = tais.getNextTarEntry()) != null) {
-						if (entry.isDirectory()) {
-						} else {
-							FSDataOutputStream out = fs
-								.create(new Path(workingPath.concat("filess/").concat(entry.getName())));
-							GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out));
-							try {
-								byte[] b = new byte[1024];
-								int numBytes = 0;
-								while ((numBytes = tais.read(b)) != -1) {
-									gzipOs.write(b, 0, numBytes);
-								}
-								filesCounter.add(1);
-							} finally {
-								IOUtils.closeQuietly(out);
-								IOUtils.closeQuietly(gzipOs);
-							}
-
-						}
-					}
-				}
-				Log.info("Crossref dump reading completed");
-				Log.info("Files counter: " + filesCounter.value());
-			});
+		Path hdfsreadpath = new Path(hdfsServerUri.concat(workingPath).concat(crossrefFileNameTarGz));
+		Configuration conf = new Configuration();
+		conf.set("fs.defaultFS", hdfsServerUri.concat(workingPath));
+		conf.set("fs.hdfs.impl", org.apache.hadoop.hdfs.DistributedFileSystem.class.getName());
+		conf.set("fs.file.impl", org.apache.hadoop.fs.LocalFileSystem.class.getName());
+		FileSystem fs = FileSystem.get(URI.create(hdfsServerUri.concat(workingPath)), conf);
+		FSDataInputStream crossrefFileStream = fs.open(hdfsreadpath);
+		try (TarArchiveInputStream tais = new TarArchiveInputStream(
+			new GzipCompressorInputStream(crossrefFileStream))) {
+			TarArchiveEntry entry = null;
+			while ((entry = tais.getNextTarEntry()) != null) {
+				if (entry.isDirectory()) {
+				} else {
+					try (
+						FSDataOutputStream out = fs
+							.create(new Path(workingPath.concat("filess/").concat(entry.getName()).concat(".gz")));
+						GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
+
+						IOUtils.copy(tais, gzipOs);
+
+					}
+				}
+			}
+		}
+		Log.info("Crossref dump reading completed");
+
 	}
 }
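For readers who want to exercise the new extraction pattern in isolation, the sketch below reproduces the core idea of this commit: stream the tar.gz once with commons-compress and re-compress each entry into its own .gz file through try-with-resources and IOUtils.copy. It is not part of the patch; the class name, the crossref_dump.tar.gz input path and the filess/ output directory are placeholders, and java.nio.file stands in for the Hadoop FileSystem calls shown in the added lines above.

// Standalone sketch (not from the patch): splits a local Crossref-style .tar.gz into
// one .gz file per archive entry, using the same commons-compress / commons-io calls
// as the patched ExtractCrossrefRecords. Paths and class name are hypothetical.
import java.io.BufferedOutputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.GZIPOutputStream;

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.IOUtils;

public class LocalTarGzSplitter {

	public static void main(String[] args) throws Exception {
		Path archive = Paths.get("crossref_dump.tar.gz"); // hypothetical local dump
		Path outDir = Files.createDirectories(Paths.get("filess"));

		try (InputStream in = Files.newInputStream(archive);
			TarArchiveInputStream tais = new TarArchiveInputStream(new GzipCompressorInputStream(in))) {
			TarArchiveEntry entry;
			while ((entry = tais.getNextTarEntry()) != null) {
				if (entry.isDirectory()) {
					continue; // the patch likewise skips directory entries
				}
				Path target = outDir.resolve(entry.getName() + ".gz");
				Files.createDirectories(target.getParent());
				// try-with-resources replaces the old manual buffer loop and closeQuietly calls
				try (OutputStream out = Files.newOutputStream(target);
					GZIPOutputStream gzipOs = new GZIPOutputStream(new BufferedOutputStream(out))) {
					// TarArchiveInputStream signals end-of-stream at the end of the current
					// entry, so IOUtils.copy writes exactly one entry per output file
					IOUtils.copy(tais, gzipOs);
				}
			}
		}
	}
}

In the patched ExtractCrossrefRecords the same loop simply obtains its streams from FileSystem.open and FileSystem.create on the configured HDFS paths instead of java.nio.file.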