Added option to split dumps bigger than 40 GB into different files

2020-10-30 14:09:04 +01:00 · 2020-10-30 14:09:04 +01:00 · 14bf2e7238
parent 78fdb11c3f
commit 14bf2e7238
4 changed files with 85 additions and 1 deletions
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/MakeTar.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/MakeTar.java
@ -59,7 +59,79 @@ public class MakeTar implements Serializable {
 			String p_string = p.toString();
 			String entity = p_string.substring(p_string.lastIndexOf("/") + 1);

-			write(fileSystem, p_string, outputPath + "/" + entity + ".tar", entity);
+			writeMaxSize(fileSystem, p_string, outputPath + "/" + entity, entity, 40);
+		}
+
+	}
+
+	/**
+	 * Creates a tar output stream writing to {@code outputPath} on the given file system.
+	 * <p>
+	 * Anything already present at that path is deleted (recursively) before the new file is created.
+	 *
+	 * @param fileSystem the file system on which the archive is created
+	 * @param outputPath full path of the tar file to create
+	 * @return a tar stream layered on the stream wrapped by the newly created output stream
+	 * @throws IOException if checking, deleting or creating the path fails
+	 */
+	private static TarArchiveOutputStream getTar(FileSystem fileSystem, String outputPath) throws IOException {
+		final Path target = new Path(outputPath);
+
+		// start from a clean location: drop whatever a previous run left behind
+		if (fileSystem.exists(target)) {
+			fileSystem.delete(target, true);
+		}
+
+		final FSDataOutputStream created = fileSystem.create(target);
+		return new TarArchiveOutputStream(created.getWrappedStream());
+	}
+
+	/**
+	 * Archives the files found under {@code inputPath} into one or more tar files, starting a new part once the
+	 * current one has reached the requested size.
+	 * <p>
+	 * When the total length of the input does not exceed the split size, a single {@code outputPath + ".tar"} is
+	 * produced through {@code write}. Otherwise the parts are named {@code outputPath + "_" + n + ".tar"}, with
+	 * {@code n} counting from 0. Files are never split across parts: the size is checked before a file is added, so
+	 * a part can exceed the limit by up to the length of its last file. The last part may contain no entries if
+	 * only {@code _SUCCESS} files are left in the listing.
+	 *
+	 * @param fileSystem the file system holding the input files and receiving the archives
+	 * @param inputPath directory whose files (listed recursively) are archived
+	 * @param outputPath destination path without the {@code .tar} extension
+	 * @param dir_name directory name prefixed to every entry inside the archive
+	 * @param gBperSplit maximum size of each part in GiB; must be positive
+	 * @throws IOException if reading the input or writing an archive fails
+	 * @throws IllegalArgumentException if {@code gBperSplit} is not positive
+	 */
+	private static void writeMaxSize(FileSystem fileSystem, String inputPath, String outputPath, String dir_name,
+		int gBperSplit) throws IOException {
+		if (gBperSplit <= 0) {
+			// a non positive limit would never let a part accept a file and the split loop would not terminate
+			throw new IllegalArgumentException("gBperSplit must be positive, got " + gBperSplit);
+		}
+		final long bytesPerSplit = 1024L * 1024L * 1024L * gBperSplit;
+
+		// getLength() is the plain sum of the file lengths; getSpaceConsumed() also accounts for the replicas and
+		// therefore cannot be compared with the per-file lengths accumulated below
+		final long sourceSize = fileSystem.getContentSummary(new Path(inputPath)).getLength();
+
+		if (sourceSize <= bytesPerSplit) {
+			write(fileSystem, inputPath, outputPath + ".tar", dir_name);
+		} else {
+			int partNum = 0;
+			final byte[] data = new byte[1024];
+
+			RemoteIterator<LocatedFileStatus> fileStatusListIterator = fileSystem
+				.listFiles(
+					new Path(inputPath), true);
+			// driven by the listing itself, so the loop ends exactly when every file has been archived
+			while (fileStatusListIterator.hasNext()) {
+				try (TarArchiveOutputStream ar = getTar(fileSystem, outputPath + "_" + partNum + ".tar")) {
+					long current_size = 0;
+					while (fileStatusListIterator.hasNext() && current_size < bytesPerSplit) {
+						LocatedFileStatus fileStatus = fileStatusListIterator.next();
+
+						String p_string = fileStatus.getPath().toString();
+						if (p_string.endsWith("_SUCCESS")) {
+							// marker file, not part of the dump
+							continue;
+						}
+
+						String name = p_string.substring(p_string.lastIndexOf("/") + 1);
+						if (name.trim().equalsIgnoreCase("communities_infrastructures")) {
+							name = "communities_infrastructures.json";
+						}
+						TarArchiveEntry entry = new TarArchiveEntry(dir_name + "/" + name);
+						entry.setSize(fileStatus.getLen());
+						current_size += fileStatus.getLen();
+						ar.putArchiveEntry(entry);
+
+						// the input stream is released even when the copy fails half way
+						try (InputStream bis = new BufferedInputStream(fileSystem.open(fileStatus.getPath()))) {
+							int count;
+							while ((count = bis.read(data, 0, data.length)) != -1) {
+								ar.write(data, 0, count);
+							}
+						}
+						ar.closeArchiveEntry();
+					}
+				}
+				partNum += 1;
+			}
+
 		}

 	}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ttl/OrganizationInfo.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ttl/OrganizationInfo.java
@ -0,0 +1,4 @@
+package eu.dnetlib.dhp.oa.graph.dump.ttl;
+
+public class OrganizationInfo {
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ttl/Pids.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ttl/Pids.java
@ -0,0 +1,4 @@
+package eu.dnetlib.dhp.oa.graph.dump.ttl;
+
+public class Pids {
+}
--- a/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ttl/SparkPrepareOrganizationInfo.java
+++ b/dhp-workflows/dhp-graph-mapper/src/main/java/eu/dnetlib/dhp/oa/graph/dump/ttl/SparkPrepareOrganizationInfo.java
@ -0,0 +1,4 @@
+package eu.dnetlib.dhp.oa.graph.dump.ttl;
+
+public class SparkPrepareOrganizationInfo {
+}