accounting-aggregator-se-pl.../src/test/java/org/gcube/accounting/aggregator/file/EliminateDuplicates.java

package org.gcube.accounting.aggregator.file;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import org.gcube.accounting.aggregator.ContextTest;
import org.gcube.accounting.aggregator.aggregation.AggregationInfo;
import org.gcube.accounting.aggregator.aggregation.AggregationType;
import org.gcube.accounting.aggregator.directory.FileSystemDirectoryStructure;
import org.gcube.accounting.aggregator.elaboration.Elaborator;
import org.gcube.accounting.aggregator.utility.Utility;
import org.gcube.accounting.datamodel.usagerecords.StorageStatusRecord;
import org.gcube.com.fasterxml.jackson.databind.JsonNode;
import org.gcube.documentstore.records.DSMapper;
import org.gcube.documentstore.records.Record;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author Luca Frosini (ISTI - CNR)
 *
 * This class has been used to eliminate duplicates from a backup file
 * because the aggregation was started twice concurrently.
 */
public class EliminateDuplicates extends ContextTest {

    private static Logger logger = LoggerFactory.getLogger(EliminateDuplicates.class);

    public final static String ORIGINAL_NO_DUPLICATES_SUFFIX = ".original-no-duplicates.json";

    protected AggregationInfo aggregationInfo;
    protected File originalRecordsbackupFile;
    protected File noDuplicatesRecordsbackupFile;

    protected int readLines;
    protected int discardedLines;
    protected int uniqueLines;

    protected Set<String> ids;
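
    /**
     * Builds the path of the de-duplicated backup file: same directory as the
     * original backup file, with ORIGINAL_SUFFIX replaced by
     * ORIGINAL_NO_DUPLICATES_SUFFIX in the file name.
     */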
    protected File getAggregatedRecordsBackupFile() throws Exception {
        File aggregateRecordsBackupFile = new File(originalRecordsbackupFile.getParentFile(),
                originalRecordsbackupFile.getName().replace(Elaborator.ORIGINAL_SUFFIX, ORIGINAL_NO_DUPLICATES_SUFFIX));
        return aggregateRecordsBackupFile;
    }
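
    /**
     * Prepares the duplicate elimination for StorageStatusRecord, MONTHLY
     * aggregation, January 2022: it resolves the original backup file and the
     * target de-duplicated file, then deletes any previous de-duplicated output.
     * The call to readFile() is left commented out and must be enabled explicitly.
     */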
    @Ignore
    @Test
    public void eliminateDuplicates() throws Exception {
        ids = new HashSet<>();

        String recordType = StorageStatusRecord.class.getSimpleName();
        AggregationType aggregationType = AggregationType.MONTHLY;

        Calendar start = Utility.getAggregationStartCalendar(2022, Calendar.JANUARY, 1);
        Calendar end = Utility.getEndCalendarFromStartCalendar(aggregationType, start, 1);
        Date aggregationStartDate = start.getTime();
        Date aggregationEndDate = end.getTime();

        aggregationInfo = new AggregationInfo(recordType, aggregationType, aggregationStartDate, aggregationEndDate);

        FileSystemDirectoryStructure fileSystemDirectoryStructure = new FileSystemDirectoryStructure();
        File elaborationDirectory = fileSystemDirectoryStructure.getTargetFolder(aggregationType, aggregationStartDate);

        originalRecordsbackupFile = Elaborator.getOriginalRecordsBackupFile(elaborationDirectory, aggregationInfo);
        noDuplicatesRecordsbackupFile = getAggregatedRecordsBackupFile();
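        // Remove any previously generated de-duplicated file so it is rebuilt from scratch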
        noDuplicatesRecordsbackupFile.delete();
        // readFile();
    }
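
    /**
     * Parses a single JSON record and writes it to the de-duplicated file only if
     * its id has not been seen before; otherwise the line is counted as discarded.
     */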
    protected void elaborateLine(String line) throws Exception {
        JsonNode jsonNode = DSMapper.asJsonNode(line);
        String id = jsonNode.get(Record.ID).asText();
        if(!ids.contains(id)) {
            ids.add(id);
            ++uniqueLines;
            Utility.printLine(noDuplicatesRecordsbackupFile, line);
        } else {
            logger.trace("Record with id {} was already found, it will be discarded.", id);
            ++discardedLines;
        }
    }
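
    /**
     * Reads the original backup file line by line, delegating each line to
     * elaborateLine(String), and finally checks that the number of read lines
     * equals the sum of unique and discarded lines.
     */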
    protected void readFile() throws Exception {
        FileInputStream fstream = null;
        DataInputStream in = null;
        BufferedReader br = null;
        try {
            // Open the original backup file for reading
            fstream = new FileInputStream(originalRecordsbackupFile);
            in = new DataInputStream(fstream);
            br = new BufferedReader(new InputStreamReader(in));

            readLines = 0;
            discardedLines = 0;
            uniqueLines = 0;

            String line;
            // Read the file line by line
            while((line = br.readLine()) != null) {
                elaborateLine(line);
                ++readLines;
            }

            logger.info("Original records are {}. Unique records are {}. Discarded duplicate records are {}",
                    readLines, uniqueLines, discardedLines);
            Assert.assertEquals(readLines, uniqueLines + discardedLines);
        } finally {
            if(br != null) {
                br.close();
            }
            if(in != null) {
                in.close();
            }
            if(fstream != null) {
                fstream.close();
            }
        }
    }
}