package org.gcube.accounting.aggregator.file;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.util.Calendar;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;

import org.gcube.accounting.aggregator.ContextTest;
import org.gcube.accounting.aggregator.aggregation.AggregationInfo;
import org.gcube.accounting.aggregator.aggregation.AggregationType;
import org.gcube.accounting.aggregator.directory.FileSystemDirectoryStructure;
import org.gcube.accounting.aggregator.elaboration.Elaborator;
import org.gcube.accounting.aggregator.utility.Utility;
import org.gcube.accounting.datamodel.usagerecords.StorageStatusRecord;
import org.gcube.com.fasterxml.jackson.databind.JsonNode;
import org.gcube.documentstore.records.DSMapper;
import org.gcube.documentstore.records.Record;
import org.junit.Assert;
import org.junit.Ignore;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * @author Luca Frosini (ISTI - CNR)
 *
 * This class has been used to eliminate duplicates from a file
 * because the aggregation was started twice concurrently.
 */
public class EliminateDuplicates extends ContextTest {

    private static Logger logger = LoggerFactory.getLogger(EliminateDuplicates.class);

    public final static String ORIGINAL_NO_DUPLICATES_SUFFIX = ".original-no-duplicates.json";
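
    // State shared by the one-shot elaboration: the aggregation being processed,
    // the original and deduplicated backup files, line counters and the set of
    // record IDs already written to the deduplicated file.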
    protected AggregationInfo aggregationInfo;
    protected File originalRecordsbackupFile;
    protected File noDuplicatesRecordsbackupFile;

    protected int readLines;
    protected int discardedLines;
    protected int uniqueLines;
    protected Set<String> ids;
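
    // Derives the name of the deduplicated backup file from the original backup
    // file name by replacing its suffix.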
    protected File getAggregatedRecordsBackupFile() throws Exception {
        File aggregateRecordsBackupFile = new File(originalRecordsbackupFile.getParentFile(),
                originalRecordsbackupFile.getName().replace(Elaborator.ORIGINAL_SUFFIX, ORIGINAL_NO_DUPLICATES_SUFFIX));
        return aggregateRecordsBackupFile;
    }
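
    // One-off maintenance test: it locates the original backup file for the
    // selected aggregation period, prepares the deduplicated output file and,
    // once the readFile() call is uncommented, rewrites it without duplicates.
    // It is kept @Ignore'd so it does not run as part of the normal test suite.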
    @Ignore
    @Test
    public void eliminateDuplicates() throws Exception {
        ids = new HashSet<>();
        String recordType = StorageStatusRecord.class.getSimpleName();
        AggregationType aggregationType = AggregationType.MONTHLY;
        Calendar start = Utility.getAggregationStartCalendar(2022, Calendar.JANUARY, 1);
        Calendar end = Utility.getEndCalendarFromStartCalendar(aggregationType, start, 1);
        Date aggregationStartDate = start.getTime();
        Date aggregationEndDate = end.getTime();
        aggregationInfo = new AggregationInfo(recordType, aggregationType, aggregationStartDate, aggregationEndDate);
        FileSystemDirectoryStructure fileSystemDirectoryStructure = new FileSystemDirectoryStructure();
        File elaborationDirectory = fileSystemDirectoryStructure.getTargetFolder(aggregationType, aggregationStartDate);
        originalRecordsbackupFile = Elaborator.getOriginalRecordsBackupFile(elaborationDirectory, aggregationInfo);
        noDuplicatesRecordsbackupFile = getAggregatedRecordsBackupFile();
        noDuplicatesRecordsbackupFile.delete();
        // readFile();
    }
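
    // Parses a backup line as JSON and keeps only the first occurrence of each
    // record ID: unique lines are appended to the deduplicated file, lines whose
    // ID has already been seen are counted and discarded.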
    protected void elaborateLine(String line) throws Exception {
        JsonNode jsonNode = DSMapper.asJsonNode(line);
        String id = jsonNode.get(Record.ID).asText();
        if(!ids.contains(id)) {
            ids.add(id);
            ++uniqueLines;
            Utility.printLine(noDuplicatesRecordsbackupFile, line);
        } else {
            logger.trace("Record with id {} was already found, it will be discarded.", id);
            ++discardedLines;
        }
    }
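
    // Reads the original backup file line by line, delegating each line to
    // elaborateLine(), and finally checks that every line was either kept or
    // discarded (readLines == uniqueLines + discardedLines).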
    protected void readFile() throws Exception {
        FileInputStream fstream = null;
        DataInputStream in = null;
        BufferedReader br = null;
        try {
            // Open the original records backup file
            fstream = new FileInputStream(originalRecordsbackupFile);
            in = new DataInputStream(fstream);
            br = new BufferedReader(new InputStreamReader(in));

            readLines = 0;
            discardedLines = 0;
            uniqueLines = 0;

            String line;
            // Read the file line by line
            while((line = br.readLine()) != null) {
                elaborateLine(line);
                ++readLines;
            }

            logger.info("Original records are {}. Unique records are {}. Discarded duplicate records are {}.", readLines, uniqueLines, discardedLines);

            Assert.assertTrue(readLines == (uniqueLines + discardedLines));

        } finally {
            if(br != null) {
                br.close();
            }
            if(in != null) {
                in.close();
            }
            if(fstream != null) {
                fstream.close();
            }
        }

    }

}