accounting-aggregator-se-pl.../src/test/java/org/gcube/accounting/aggregator/file/EliminateDuplicates.java

package org.gcube.accounting.aggregator.file;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import org.gcube.accounting.aggregator.ContextTest;
import org.gcube.accounting.aggregator.aggregation.AggregationInfo;
import org.gcube.accounting.aggregator.aggregation.AggregationType;
import org.gcube.accounting.aggregator.directory.FileSystemDirectoryStructure;
import org.gcube.accounting.aggregator.elaboration.Elaborator;
import org.gcube.accounting.aggregator.utility.Utility;
import org.gcube.accounting.datamodel.usagerecords.StorageStatusRecord;
import org.gcube.com.fasterxml.jackson.databind.JsonNode;
import org.gcube.documentstore.records.DSMapper;
import org.gcube.documentstore.records.Record;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author Luca Frosini (ISTI - CNR)
 *
 * This class has been used to eliminate duplicates from a backup file
 * because the aggregation was started twice concurrently.
 */
public class EliminateDuplicates extends ContextTest {

    private static Logger logger = LoggerFactory.getLogger(EliminateDuplicates.class);

    public final static String ORIGINAL_NO_DUPLICATES_SUFFIX = ".original-no-duplicates.json";

    protected AggregationInfo aggregationInfo;
    protected File originalRecordsbackupFile;
    protected File noDuplicatesRecordsbackupFile;

    protected int readLines;
    protected int discardedLines;
    protected int uniqueLines;

    protected Set<String> ids;
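
    /**
     * Builds the path of the de-duplicated backup file: same directory as the
     * original backup file, with ORIGINAL_SUFFIX replaced by
     * ORIGINAL_NO_DUPLICATES_SUFFIX in the file name.
     */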
    protected File getAggregatedRecordsBackupFile() throws Exception {
        File aggregateRecordsBackupFile = new File(originalRecordsbackupFile.getParentFile(),
                originalRecordsbackupFile.getName().replace(Elaborator.ORIGINAL_SUFFIX, ORIGINAL_NO_DUPLICATES_SUFFIX));
        return aggregateRecordsBackupFile;
    }
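
    /**
     * Prepares the duplicate elimination for StorageStatusRecord, MONTHLY
     * aggregation, January 2022: it resolves the original backup file and the
     * target de-duplicated file, then deletes any previous de-duplicated output.
     * The call to readFile() is left commented out and must be enabled explicitly.
     */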
    @Ignore
    @Test
    public void eliminateDuplicates() throws Exception {
        ids = new HashSet<>();

        String recordType = StorageStatusRecord.class.getSimpleName();
        AggregationType aggregationType = AggregationType.MONTHLY;

        Calendar start = Utility.getAggregationStartCalendar(2022, Calendar.JANUARY, 1);
        Calendar end = Utility.getEndCalendarFromStartCalendar(aggregationType, start, 1);
        Date aggregationStartDate = start.getTime();
        Date aggregationEndDate = end.getTime();

        aggregationInfo = new AggregationInfo(recordType, aggregationType, aggregationStartDate, aggregationEndDate);

        FileSystemDirectoryStructure fileSystemDirectoryStructure = new FileSystemDirectoryStructure();
        File elaborationDirectory = fileSystemDirectoryStructure.getTargetFolder(aggregationType, aggregationStartDate);

        originalRecordsbackupFile = Elaborator.getOriginalRecordsBackupFile(elaborationDirectory, aggregationInfo);
        noDuplicatesRecordsbackupFile = getAggregatedRecordsBackupFile();
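        // Remove any previously generated de-duplicated file so it is rebuilt from scratch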
        noDuplicatesRecordsbackupFile.delete();
        // readFile();
    }
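
    /**
     * Parses a single JSON record and writes it to the de-duplicated file only if
     * its id has not been seen before; otherwise the line is counted as discarded.
     */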
    protected void elaborateLine(String line) throws Exception {
        JsonNode jsonNode = DSMapper.asJsonNode(line);
        String id = jsonNode.get(Record.ID).asText();
        if(!ids.contains(id)) {
            ids.add(id);
            ++uniqueLines;
            Utility.printLine(noDuplicatesRecordsbackupFile, line);
        } else {
            logger.trace("Record with id {} was already found, it will be discarded.", id);
            ++discardedLines;
        }
    }
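
    /**
     * Reads the original backup file line by line, delegating each line to
     * elaborateLine(String), and finally checks that the number of read lines
     * equals the sum of unique and discarded lines.
     */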
    protected void readFile() throws Exception {
        FileInputStream fstream = null;
        DataInputStream in = null;
        BufferedReader br = null;
        try {
            // Open the original backup file for reading
            fstream = new FileInputStream(originalRecordsbackupFile);
            in = new DataInputStream(fstream);
            br = new BufferedReader(new InputStreamReader(in));

            readLines = 0;
            discardedLines = 0;
            uniqueLines = 0;

            String line;
            // Read the file line by line
            while((line = br.readLine()) != null) {
                elaborateLine(line);
                ++readLines;
            }

            logger.info("Original records are {}. Unique records are {}. Discarded duplicate records are {}",
                    readLines, uniqueLines, discardedLines);
            Assert.assertEquals(readLines, uniqueLines + discardedLines);
        } finally {
            if(br != null) {
                br.close();
            }
            if(in != null) {
                in.close();
            }
            if(fstream != null) {
                fstream.close();
            }
        }
    }
}