1287 lines
34 KiB
Java
1287 lines
34 KiB
Java
package org.gcube.dataanalysis.databases.sampler;
|
|
|
|
import java.util.ArrayList;
|
|
import java.util.Collections;
|
|
import java.util.List;
|
|
import java.util.LinkedHashMap;
|
|
import java.util.Random;
|
|
|
|
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
|
|
import org.gcube.dataanalysis.databases.utils.ConnectionManager;
|
|
import org.hibernate.SessionFactory;
|
|
|
|
/**
 * Performs the different kinds of sample operation on a database table:
 * SampleOnTable, SmartSampleOnTable and RandomSampleOnTable.
 */
|
|
|
|
public class Sampler {
|
|
|
|
// query to perform sample operation on the table
|
|
private static final String queryForSampleOnTablePostgres = "select %1$s from \"%2$s\" limit 100";
|
|
private static final String queryForSampleOnTableMysql = "select %1$s from %2$s limit 100";
|
|
|
|
// query to perform a smart sample operation randomly on the table
|
|
private static final String queryForSmartSampleOnTablePostgres = "select %1$s from \"%2$s\" order by random() limit 200";
|
|
private static final String queryForSmartSampleOnTableMysql = "select %1$s from %2$s order by rand() limit 200";
|
|
// private static final String queryForSmartSampleOnTablePostgres =
|
|
// "select * from \"%1$s\" order by random() limit 200";
|
|
// private static final String queryForSmartSampleOnTableMysql =
|
|
// "select * from %1$s order by rand() limit 200";
|
|
|
|
// query to perform a smart sample operation on the table considering the
|
|
// threshold
|
|
private static final String queryForSmartSampleWithThresholdOnTablePostgres = "select %1$s from \"%2$s\" limit 200 offset %3$s";
|
|
// private static final String queryForSmartSampleWithThresholdOnTableMysql
|
|
// = "select %1$s from %2$s limit 200 offset %3$s";
|
|
|
|
// query to perform a sample operation randomly on a table
|
|
// private static final String queryForRandomSampleOnTablePostgres =
|
|
// "select %1$s from \"%2$s\" order by random() limit 100";
|
|
// query to perform a smart sample operation on the table considering the
|
|
// threshold
|
|
private static final String queryForRandomSampleWithThresholdOnTablePostgres = "select %1$s from \"%2$s\" limit 100 offset %3$s";
|
|
private static final String queryForRandomSampleOnTableMysql = "select %1$s from %2$s order by rand() limit 100";
|
|
private static final String queryForRandomSampleOnTablePostgres = "select %1$s from %2$s order by random() limit 100";
|
|
|
|
// query to get columns' name
|
|
private static final String queryForColumnsPostgres = "SELECT column_name FROM information_schema.COLUMNS WHERE table_name ='%1$s' and table_schema='%2$s'";
|
|
private static final String queryForColumnsMysql = "SELECT column_name FROM information_schema.COLUMNS WHERE table_name ='%1$s' and table_schema='%2$s'";
|
|
|
|
private static final String MYSQL = "MySQL";
|
|
private static final String POSTGRES = "Postgres";
|
|
|
|
private List<String> listColumns = null;
|
|
|
|
public Sampler() {
|
|
|
|
}
|
|
|
|
// retrieve the first 100 rows of a table
|
|
public List<Object> sampleOnTable(ConnectionManager connection,
|
|
SessionFactory dbSession, String DBType, String tableName,
|
|
String schemaName, List<String> DataTypeColumns) throws Exception {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->starting the Sample on table operation");
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->retrieving the first 100 rows");
|
|
|
|
// preparing the query to get the first 100 rows of a table
|
|
|
|
List<Object> resultSet = null;
|
|
|
|
String querySampleOnTable = null;
|
|
|
|
// get a formatted list columns
|
|
|
|
String listAttributes = null;
|
|
listAttributes = getQuery(connection, dbSession, DBType, tableName,
|
|
schemaName, DataTypeColumns);
|
|
|
|
// preparing the query
|
|
|
|
if (DBType.equals(POSTGRES)) {
|
|
|
|
querySampleOnTable = String.format(queryForSampleOnTablePostgres,
|
|
listAttributes, tableName);
|
|
|
|
}
|
|
|
|
if (DBType.equals(MYSQL)) {
|
|
|
|
querySampleOnTable = String.format(queryForSampleOnTableMysql,
|
|
listAttributes, tableName);
|
|
|
|
}
|
|
|
|
AnalysisLogger.getLogger()
|
|
.debug("Sampler->preparing to submit the query: "
|
|
+ querySampleOnTable);
|
|
|
|
resultSet = connection.executeQuery(querySampleOnTable, dbSession);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->query submitted successfully");
|
|
|
|
if (resultSet == null) {
|
|
AnalysisLogger
|
|
.getLogger()
|
|
.debug("Sampler->Error: The table has not rows. Sample operation not possible");
|
|
|
|
throw new Exception(
|
|
"The resulting table has not rows. Sample operation not possible");
|
|
|
|
}
|
|
|
|
// return the first 100 rows
|
|
return resultSet;
|
|
|
|
}
|
|
|
|
// build the formatted (cast) column list used in the select clause of the sample queries
|
|
private String getQuery(ConnectionManager connection,
|
|
SessionFactory dbSession, String DBType, String tableName,
|
|
String schemaName, List<String> DataTypeColumns) throws Exception {
|
|
|
|
// List<String> listColumns = null;
|
|
|
|
// get columns list
|
|
listColumns = getListColumns(connection, dbSession, DBType, tableName,
|
|
schemaName);
|
|
|
|
// String querySampleOnTable = null;
|
|
String listAttributes = null;
|
|
|
|
if (listColumns != null) {
|
|
|
|
// preparing the query to get the first 100 rows of a table
|
|
|
|
// preparing the query with formatted list column names
|
|
|
|
listAttributes = "";
|
|
String attribute = null;
|
|
|
|
for (int i = 0; i < listColumns.size(); i++) {
|
|
|
|
if (DBType.equals(POSTGRES)) {
|
|
|
|
attribute = "CAST(" + listColumns.get(i) + " as text), ";
|
|
|
|
if (i == (listColumns.size() - 1)) {
|
|
|
|
attribute = "CAST(" + listColumns.get(i) + " as text)";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// for a value whose datatype is char or varchar a cast to utf8
|
|
// is performed while for other datatypes in order to return a
|
|
// correct value a cast to binary and a second to char are
|
|
// performed.(because a cast of large numerical values to char
|
|
// are truncated)
|
|
|
|
if (DBType.equals(MYSQL)) {
|
|
|
|
if (DataTypeColumns.get(i).contains("char")) {
|
|
|
|
// attribute = "CAST(" + listColumns.get(i) +
|
|
// " as CHAR CHARACTER SET utf8), ";
|
|
// attribute = "CONVERT(" + listColumns.get(i) +
|
|
// ", CHAR), ";
|
|
attribute = "CAST(" + listColumns.get(i)
|
|
+ " as CHAR CHARACTER SET utf8), ";
|
|
|
|
if (i == (listColumns.size() - 1)) {
|
|
|
|
// attribute = "CAST(" + listColumns.get(i) +
|
|
// " as CHAR CHARACTER SET utf8)";
|
|
|
|
// attribute = "CONVERT(" + listColumns.get(i) +
|
|
// ", BINARY)";
|
|
|
|
attribute = "CAST(" + listColumns.get(i)
|
|
+ " as CHAR CHARACTER SET utf8)";
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
attribute = "CAST(CAST(" + listColumns.get(i)
|
|
+ " as BINARY) as CHAR CHARACTER SET utf8), ";
|
|
|
|
if (i == (listColumns.size() - 1)) {
|
|
|
|
// attribute = "CAST(" + listColumns.get(i) +
|
|
// " as CHAR CHARACTER SET utf8)";
|
|
|
|
// attribute = "CONVERT(" + listColumns.get(i) +
|
|
// ", BINARY)";
|
|
|
|
attribute = "CAST(CAST(" + listColumns.get(i)
|
|
+ " as BINARY) as CHAR CHARACTER SET utf8)";
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
listAttributes = listAttributes + attribute;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return listAttributes;
|
|
|
|
}
|
|
|
|
// get list columns of a table
|
|
private List<String> getListColumns(ConnectionManager connection,
|
|
SessionFactory dbSession, String DBType, String tableName,
|
|
String schemaName) throws Exception {
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->retrieving column names");
|
|
|
|
// preparing the query to get columns' names
|
|
String queryColumns = null;
|
|
|
|
// build the query for database postgres. The parameter "schemaName" is
|
|
// the schema name.
|
|
if (DBType.equals(POSTGRES)) {
|
|
|
|
queryColumns = String.format(queryForColumnsPostgres, tableName,
|
|
schemaName);
|
|
|
|
}
|
|
|
|
// build the query for database mysql. The parameter "schemaName" is the
|
|
// database name.
|
|
if (DBType.equals(MYSQL)) {
|
|
|
|
queryColumns = String.format(queryForColumnsMysql, tableName,
|
|
schemaName);
|
|
|
|
}
|
|
|
|
List<Object> columnsSet = null;
|
|
List<String> listColumns = null;
|
|
|
|
columnsSet = connection.executeQuery(queryColumns, dbSession);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->query submitted successfully: " + queryColumns);
|
|
|
|
if (columnsSet != null) {
|
|
|
|
listColumns = new ArrayList<String>();
|
|
|
|
// //print check
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "DatabaseManagement->size: " + columnsSet.size());
|
|
|
|
for (int i = 0; i < columnsSet.size(); i++) {
|
|
|
|
Object element = columnsSet.get(i);
|
|
|
|
// //print check
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->values: " + element);
|
|
|
|
ArrayList<Object> listvalues = new ArrayList<Object>(
|
|
((LinkedHashMap<String, Object>) element).values());
|
|
|
|
// //print check
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->values: " + listvalues);
|
|
|
|
listColumns.add((String) listvalues.get(0));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return listColumns;
|
|
|
|
}
|
|
|
|
// retrieve 100 rows of a table randomly that have the maximum number of
|
|
// columns not null
|
|
public List<Object> smartSampleOnTable(ConnectionManager connection,
|
|
SessionFactory dbSession, String DBType, String tableName,
|
|
String schemaName, long NumRows, List<String> DataTypeColumns)
|
|
throws Exception {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->starting the Smart Sample on table operation");
|
|
|
|
if (NumRows == 0) {
|
|
throw new Exception(
|
|
"The table has not rows. Smart Sample operation not possible");
|
|
}
|
|
|
|
// // computation of the iterations number
|
|
// int NIterations = computeNumberIterations(NumRows);
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->Iterations number: " + NIterations);
|
|
//
|
|
// // computation of the 100 rows randomly
|
|
//
|
|
// AnalysisLogger.getLogger().debug("Sampler->retrieving rows");
|
|
|
|
List<Object> rows = null;
|
|
|
|
// if is rows number <= 700000 then the pure smart sample procedure is
|
|
// performed otherwise a not pure smart sample procedure is performed in
|
|
// order to solve a bug with the random function in postgres
|
|
|
|
if ((NumRows > 700000) && (DBType.equals(POSTGRES))) { // Postgres
|
|
// compute the smart sample on a table
|
|
rows = computeSmartSampleWithThreshold(connection, dbSession,
|
|
DBType, tableName, schemaName, NumRows, DataTypeColumns);
|
|
|
|
} else {
|
|
|
|
// computation of the iterations number
|
|
int NIterations = computeNumberIterations(NumRows);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->Iterations number: " + NIterations);
|
|
|
|
// computation of the 100 rows randomly
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->retrieving rows");
|
|
|
|
// compute the smart sample on a table
|
|
rows = computeSmartSample(connection, dbSession, DBType, tableName,
|
|
schemaName, NIterations, DataTypeColumns,
|
|
DataTypeColumns.size());
|
|
|
|
}
|
|
|
|
// if ((NumRows <= 700000) && (DBType.equals(POSTGRES))) { // Postgres
|
|
//
|
|
// // computation of the iterations number
|
|
// int NIterations = computeNumberIterations(NumRows);
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->Iterations number: " + NIterations);
|
|
//
|
|
// // computation of the 100 rows randomly
|
|
//
|
|
// AnalysisLogger.getLogger().debug("Sampler->retrieving rows");
|
|
//
|
|
// // compute the smart sample on a table
|
|
// rows = computeSmartSample(connection, dbSession, DBType, tableName,
|
|
// schemaName, NIterations, DataTypeColumns,
|
|
// DataTypeColumns.size());
|
|
//
|
|
// }
|
|
//
|
|
// else if ((NumRows > 700000) && (DBType.equals(POSTGRES))) { //
|
|
// Postgres
|
|
// // compute the smart sample on a table
|
|
// rows = computeSmartSampleWithThreshold(connection, dbSession,
|
|
// DBType, tableName, schemaName, NumRows, DataTypeColumns);
|
|
//
|
|
// } else { // MySQL
|
|
//
|
|
// // computation of the iterations number
|
|
// int NIterations = computeNumberIterations(NumRows);
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->Iterations number: " + NIterations);
|
|
//
|
|
// // computation of the 100 rows randomly
|
|
//
|
|
// AnalysisLogger.getLogger().debug("Sampler->retrieving rows");
|
|
//
|
|
// // compute the smart sample on a table
|
|
// rows = computeSmartSample(connection, dbSession, DBType, tableName,
|
|
// schemaName, NIterations, DataTypeColumns,
|
|
// DataTypeColumns.size());
|
|
//
|
|
// }
|
|
|
|
if (rows == null) {
|
|
|
|
AnalysisLogger
|
|
.getLogger()
|
|
.debug("Sampler->Error: the Smart Sample operation on table has not returned rows");
|
|
|
|
throw new Exception(
|
|
"The Smart Sample operation on table has not returned rows");
|
|
|
|
}
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->rows retrieved");
|
|
|
|
// return the first 100 rows
|
|
return rows;
|
|
|
|
}
|
|
|
|
private int computeNumberIterations(long NumRows) {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->processing iterations number");
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->rows number: " + NumRows);
|
|
|
|
// build the formula k=(((-0.8)*NumRows)/10000)+1
|
|
double k = (((-0.8) * NumRows) / 10000) + 1.0;
|
|
|
|
// if the the parameter k is negative, the sign must be changed
|
|
double paramK = 0.0;
|
|
if (Double.compare(k, 0.0) < 0) {
|
|
|
|
paramK = k * (-1);
|
|
|
|
} else {
|
|
paramK = k;
|
|
|
|
}
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->parameter K value: " + paramK);
|
|
|
|
long NumElements = Math.min(NumRows, (long) 10000);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->choosing the min value of elements: " + NumElements);
|
|
|
|
// to build the formula NIterations=(k/200)*Nelementi
|
|
|
|
double NumIterations = (paramK / 200) * NumElements;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->iterations number: " + NumIterations);
|
|
|
|
double Iterations = Math.max(Math.round(NumIterations), 1);
|
|
|
|
AnalysisLogger.getLogger()
|
|
.debug("Sampler-> choosing the max value of iterations: "
|
|
+ Iterations);
|
|
|
|
double NumIts = Math.min(Iterations, 2.0);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler-> choosing the min value of iterations: " + NumIts);
|
|
|
|
// to round the value (with a rint logic)
|
|
return (int) (Math.rint(NumIts));
|
|
|
|
}
|
|
|
|
// Computes the smart sample. Each iteration extracts 200 rows in random
// order and scores every row by its number of non-empty columns. As soon as
// the 100th best row reaches the score threshold (80% of the columns'
// number, rounded), the row list is cut to the first 100 rows.
|
|
private List<Object> computeSmartSample(ConnectionManager connection,
|
|
SessionFactory dbSession, String DBType, String tablename,
|
|
String schemaName, int NIterations, List<String> DataTypeColumns,
|
|
int ColumnSize) throws Exception {
|
|
|
|
List<Object> resultSet = null;
|
|
|
|
String query = null;
|
|
|
|
boolean removal = false;
|
|
|
|
// map that contains for each row two information: the index
|
|
// corresponding to the row and the score number that is the columns'
|
|
// number with not null value
|
|
// HashMap<Integer, Integer> MapRows = new HashMap<Integer, Integer>();
|
|
|
|
List<RowScore> listRows = new ArrayList<RowScore>();
|
|
|
|
// get a formatted list columns
|
|
|
|
String listAttributes = null;
|
|
listAttributes = getQuery(connection, dbSession, DBType, tablename,
|
|
schemaName, DataTypeColumns);
|
|
|
|
// compute score for each row of the table
|
|
|
|
// build the query for database postgres
|
|
if (DBType.equals(POSTGRES)) {
|
|
|
|
query = String.format(queryForSmartSampleOnTablePostgres,
|
|
listAttributes, tablename);
|
|
|
|
}
|
|
// build the query for database mysql
|
|
if (DBType.equals(MYSQL)) {
|
|
|
|
query = String.format(queryForSmartSampleOnTableMysql,
|
|
listAttributes, tablename);
|
|
|
|
}
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->building the query extracting 200 rows randomly");
|
|
|
|
Object[] columnArray = null;
|
|
|
|
// define columns number with the threshold
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler-> column array dimension: " + ColumnSize);
|
|
|
|
double thresholdRank = ((ColumnSize) * 80);
|
|
thresholdRank = thresholdRank / 100;
|
|
|
|
double valCeil = Math.round(thresholdRank);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler-> number column generated by the threshold: "
|
|
+ thresholdRank + " rounded value: " + valCeil);
|
|
|
|
// extract 200 rows randomly for each iteration
|
|
extractionRows: for (int i = 0; i < NIterations; i++) {
|
|
|
|
System.out.println("index iteration: " + i);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->executing the query: " + query);
|
|
|
|
resultSet = connection.executeQuery(query, dbSession);
|
|
|
|
if (resultSet != null) {
|
|
|
|
int numrows = resultSet.size();
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->rows number: " + numrows);
|
|
|
|
AnalysisLogger
|
|
.getLogger()
|
|
.debug("Sampler->computing the score and sort the row list in a reverse natural order");
|
|
|
|
// build the list with 200 rows
|
|
for (int j = 0; j < numrows; j++) {
|
|
|
|
Object element = resultSet.get(j);
|
|
|
|
ArrayList<Object> listvalues = new ArrayList<Object>(
|
|
((LinkedHashMap<String, Object>) element).values());
|
|
|
|
columnArray = listvalues.toArray();
|
|
|
|
// compute the score for each row of the table
|
|
int score = computeColumnScore(columnArray);
|
|
|
|
// //check row score
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->row: " + j + " " + "score: " + score);
|
|
|
|
RowScore rs = new RowScore(element, score);
|
|
|
|
// insert the row in the list
|
|
listRows.add(rs);
|
|
|
|
// //check the sorting of a row
|
|
// AnalysisLogger
|
|
// .getLogger()
|
|
// .debug("Sampler->sorting the list in a reverse natural order");
|
|
|
|
// sort by reverse natural order
|
|
Collections.sort(listRows, Collections.reverseOrder());
|
|
|
|
// After each iteration there is a check to verify the row
|
|
// (corresponding to the element with index=100). If the
|
|
// score
|
|
// of
|
|
// this row is equal to the columns' number then the
|
|
// previous
|
|
// rows
|
|
// in the list have all columns value not null
|
|
|
|
// int size = listRows.size();
|
|
|
|
// to check if the row with index 100 has the max score
|
|
|
|
// thresholdRank 80%
|
|
|
|
if (listRows.size() >= 100) {
|
|
int value = listRows.get(99).getScore();
|
|
|
|
// if (value == columnArray.length) {
|
|
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler-> column array dimension: "
|
|
// + columnArray.length);
|
|
//
|
|
// double thresholdRank = ((columnArray.length) * 80);
|
|
// thresholdRank = thresholdRank / 100;
|
|
//
|
|
// double valCeil = Math.round(thresholdRank);
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler-> number column generated by the threshold: "
|
|
// + thresholdRank
|
|
// + " rounded value: " + valCeil);
|
|
|
|
if (value >= (int) valCeil) {
|
|
|
|
// //check row score
|
|
// for (int k = 0; k < listRows.size(); k++) {
|
|
//
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->row with index: " + k
|
|
// + " score "
|
|
// + listRows.get(k).getScore());
|
|
// }
|
|
|
|
removal = true;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->row 100 with score: " + value);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->starting the removal operation");
|
|
|
|
// Remove the elements from index 100 if the list's
|
|
// size
|
|
// is
|
|
// greater than 100
|
|
if (listRows.size() > 100) {
|
|
|
|
int numElemToDelete = listRows.size() - 100;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->number of rows to delete: "
|
|
+ numElemToDelete);
|
|
|
|
while (numElemToDelete != 0) {
|
|
|
|
listRows.remove(100);
|
|
|
|
numElemToDelete = numElemToDelete - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break extractionRows;
|
|
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// Remove the elements from index 100 if the list's size is
|
|
// greater than 100 and if this operation has not been done previously.
|
|
|
|
if ((listRows.size() > 100) && (removal == false)) {
|
|
|
|
for (int k = 0; k < listRows.size(); k++) {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->row with index: " + k + " score "
|
|
+ listRows.get(k).getScore());
|
|
}
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->starting the removal operation");
|
|
|
|
int numElemToDelete = listRows.size() - 100;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->number of rows to delete: " + numElemToDelete);
|
|
|
|
// cut the list of rows in order to have only 100 rows
|
|
while (numElemToDelete != 0) {
|
|
|
|
RowScore row = listRows.remove(100);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->removing row with score: " + row.getScore());
|
|
|
|
numElemToDelete = numElemToDelete - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// return the list of 100 rows
|
|
List<Object> rows = new ArrayList<Object>();
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->preparing the result (the row list): ");
|
|
|
|
for (int i = 0; i < listRows.size(); i++) {
|
|
|
|
// //check rows added in the final result
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->adding row with index: " + i);
|
|
|
|
rows.add(listRows.get(i).getRow());
|
|
|
|
}
|
|
|
|
return rows;
|
|
|
|
}
|
|
|
|
// Computes the smart sample for tables above the 700000-row threshold.
// Instead of ordering the rows randomly, it runs two queries that each read
// 200 rows starting at a randomly generated offset, and scores every row by
// its number of non-empty columns. As soon as the 100th best row reaches
// the score threshold (80% of the columns' number, rounded), the row list is
// cut to the first 100 rows.
|
|
private List<Object> computeSmartSampleWithThreshold(
|
|
ConnectionManager connection, SessionFactory dbSession,
|
|
String DBType, String tablename, String schemaName, long NumRows,
|
|
List<String> DataTypeColumns) throws Exception {
|
|
|
|
// Define threshold
|
|
int threshold = 700000;
|
|
|
|
int X, Y;
|
|
|
|
// Generate randomly two indexes used to execute two queries
|
|
|
|
Random rn = new Random();
|
|
|
|
if ((threshold + 200) <= NumRows) {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler-> 700000+200 <= rows number");
|
|
|
|
X = rn.nextInt(threshold + 1) + 200; // generate a number in
|
|
// range [200-700000]
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->X index: " + X);
|
|
|
|
// Generate a Y index
|
|
|
|
// Define Lower and Upper Index (LI and UL) of a range
|
|
|
|
int LI = X + 200;
|
|
int UI = X - 200;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->Lower Index of the range: " + LI);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->Upper Index of the range: " + UI);
|
|
|
|
int a;
|
|
|
|
do {
|
|
|
|
a = rn.nextInt(threshold + 1) + 0;
|
|
|
|
} while (!((a < UI) || (a > LI)));
|
|
|
|
Y = a;
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->Y index: " + Y);
|
|
|
|
} else {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler-> 700000+200 > rows number");
|
|
int offset = ((int) NumRows - threshold);
|
|
int valForUpperIndex = 200 - offset;
|
|
int UpperIndex = threshold - valForUpperIndex;
|
|
|
|
// Generate an X index
|
|
X = rn.nextInt(UpperIndex + 1) + 200; // generate a number in
|
|
// range
|
|
// [200-UpperIndex]
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->X index: " + X);
|
|
|
|
// Generate a Y index
|
|
|
|
// Define Lower and Upper Index (LI and UL) of a range
|
|
|
|
int LI = X + 200;
|
|
int UI = X - 200;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->Lower Index of the range: " + LI);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->Upper Index of the range: " + UI);
|
|
|
|
int a;
|
|
|
|
do {
|
|
|
|
a = rn.nextInt(UpperIndex + 1) + 0;
|
|
|
|
} while (!((a < UI) || (a > LI)));
|
|
|
|
Y = a;
|
|
|
|
}
|
|
|
|
int[] indexes = new int[2];
|
|
|
|
indexes[0] = X;
|
|
indexes[1] = Y;
|
|
|
|
// AnalysisLogger.getLogger().debug("Sampler->X index: " + indexes[0]);
|
|
|
|
// AnalysisLogger.getLogger().debug("Sampler->Y index: " + indexes[1]);
|
|
|
|
// start sample operation
|
|
|
|
List<Object> resultSet = null;
|
|
|
|
String query = null;
|
|
|
|
boolean removal = false;
|
|
|
|
// map that contains for each row two information: the index
|
|
// corresponding to the row and the score number that is the columns'
|
|
// number with not null value
|
|
// HashMap<Integer, Integer> MapRows = new HashMap<Integer, Integer>();
|
|
|
|
List<RowScore> listRows = new ArrayList<RowScore>();
|
|
|
|
// get a formatted list columns
|
|
|
|
String listAttributes = null;
|
|
listAttributes = getQuery(connection, dbSession, DBType, tablename,
|
|
schemaName, DataTypeColumns);
|
|
|
|
// compute score for each row of the table
|
|
|
|
// // build the query for database postgres
|
|
// if (DBType.equals(POSTGRES)) {
|
|
//
|
|
// query = String.format(
|
|
// queryForSmartSampleWithThresholdOnTablePostgres,
|
|
// listAttributes, tablename);
|
|
//
|
|
// }
|
|
// // build the query for database mysql
|
|
// if (DBType.equals(MYSQL)) {
|
|
//
|
|
// query = String.format(queryForSmartSampleOnTableMysql,
|
|
// listAttributes, tablename);
|
|
//
|
|
// }
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->building the query extracting 200 rows randomly");
|
|
|
|
Object[] columnArray = null;
|
|
|
|
// extract 200 rows randomly for each iteration
|
|
|
|
// Define the two queries.One query uses the X index, one query uses the
|
|
// Y index. Each query extract 200 rows.
|
|
|
|
// computation for the smart procedure
|
|
|
|
extractionRows: for (int i = 0; i < 2; i++) {
|
|
|
|
// build the query for database postgres
|
|
if (DBType.equals(POSTGRES)) {
|
|
|
|
query = String.format(
|
|
queryForSmartSampleWithThresholdOnTablePostgres,
|
|
listAttributes, tablename, indexes[i]);
|
|
|
|
}
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->executing the query: " + query);
|
|
|
|
resultSet = connection.executeQuery(query, dbSession);
|
|
|
|
if (resultSet != null) {
|
|
|
|
int numrows = resultSet.size();
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->rows number: " + numrows);
|
|
|
|
AnalysisLogger
|
|
.getLogger()
|
|
.debug("Sampler->computing the score and sorting the row list in a reverse natural order");
|
|
|
|
// build the list with 200 rows
|
|
for (int j = 0; j < numrows; j++) {
|
|
|
|
Object element = resultSet.get(j);
|
|
|
|
ArrayList<Object> listvalues = new ArrayList<Object>(
|
|
((LinkedHashMap<String, Object>) element).values());
|
|
|
|
columnArray = listvalues.toArray();
|
|
|
|
// compute the score for each row of the table
|
|
int score = computeColumnScore(columnArray);
|
|
|
|
// //check row score
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->row: " + j + " " + "score: " + score);
|
|
|
|
RowScore rs = new RowScore(element, score);
|
|
|
|
// insert the row in the list
|
|
listRows.add(rs);
|
|
|
|
// //check the sorting of a row
|
|
// AnalysisLogger
|
|
// .getLogger()
|
|
// .debug("Sampler->sorting the list in a reverse natural order");
|
|
|
|
// sort by reverse natural order
|
|
Collections.sort(listRows, Collections.reverseOrder());
|
|
|
|
// After each iteration there is a check to verify the row
|
|
// (corresponding to the element with index=100). If the
|
|
// score
|
|
// of
|
|
// this row is equal to the columns' number then the
|
|
// previous
|
|
// rows
|
|
// in the list have all columns value not null
|
|
|
|
// int size = listRows.size();
|
|
|
|
// to check if the row with index 100 has the max score
|
|
|
|
// threshold 80%
|
|
|
|
if (listRows.size() >= 100) {
|
|
int value = listRows.get(99).getScore();
|
|
|
|
// if (value == columnArray.length) {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler-> column array dimension: "
|
|
+ columnArray.length);
|
|
|
|
double thresholdRank = ((columnArray.length) * 80);
|
|
thresholdRank = thresholdRank / 100;
|
|
|
|
double valCeil = Math.round(thresholdRank);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler-> threshold: " + thresholdRank
|
|
+ " rounded value: " + valCeil);
|
|
|
|
if (value >= (int) valCeil) {
|
|
|
|
// //check row score
|
|
// for (int k = 0; k < listRows.size(); k++) {
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->row with index: " + k
|
|
// + " score "
|
|
// + listRows.get(k).getScore());
|
|
// }
|
|
|
|
removal = true;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->row 100 with score: " + value);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->starting the removal operation");
|
|
|
|
// Remove the elements from index 100 if the list's
|
|
// size
|
|
// is
|
|
// greater than 100
|
|
if (listRows.size() > 100) {
|
|
|
|
int numElemToDelete = listRows.size() - 100;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->number of rows to delete: "
|
|
+ numElemToDelete);
|
|
|
|
while (numElemToDelete != 0) {
|
|
|
|
listRows.remove(100);
|
|
|
|
numElemToDelete = numElemToDelete - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break extractionRows;
|
|
|
|
}
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
return null;
|
|
|
|
}
|
|
}
|
|
|
|
// Remove the elements from index 100 if the list's size is
|
|
// greater than 100 and if this operation has not been done previously.
|
|
|
|
if ((listRows.size() > 100) && (removal == false)) {
|
|
|
|
// check score of the row list
|
|
|
|
// for (int k = 0; k < listRows.size(); k++) {
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->row with index: " + k + " score "
|
|
// + listRows.get(k).getScore());
|
|
// }
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->starting the removal operation");
|
|
|
|
int numElemToDelete = listRows.size() - 100;
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->number of rows to delete: " + numElemToDelete);
|
|
|
|
// cut the list of rows in order to have only 100 rows
|
|
while (numElemToDelete != 0) {
|
|
|
|
RowScore row = listRows.remove(100);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->removing row with score: " + row.getScore());
|
|
|
|
numElemToDelete = numElemToDelete - 1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// return the list of 100 rows
|
|
List<Object> rows = new ArrayList<Object>();
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->preparing the result (the row list): ");
|
|
|
|
for (int i = 0; i < listRows.size(); i++) {
|
|
|
|
// //check the row list result
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->adding row with index: " + i);
|
|
|
|
rows.add(listRows.get(i).getRow());
|
|
|
|
}
|
|
|
|
return rows;
|
|
|
|
}
|
|
|
|
// compute the score for each array (the score is the number of table
|
|
// columns with value not null and not empty)
|
|
private int computeColumnScore(Object[] columnArray) {
|
|
|
|
int score = 0;
|
|
|
|
for (int i = 0; i < columnArray.length; i++) {
|
|
|
|
if (columnArray[i] != null) {
|
|
|
|
if (!(columnArray[i].toString().equals(""))) {
|
|
|
|
score++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return score;
|
|
|
|
}
|
|
|
|
// // retrieve 100 rows of a table randomly
|
|
// public List<Object> randomSampleOnTable(ConnectionManager connection,
|
|
// SessionFactory dbSession, String DBType, String tableName,
|
|
// String schemaName, List<String> DataTypeColumns) throws Exception {
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->starting the Random Sample on table operation");
|
|
//
|
|
// AnalysisLogger.getLogger().debug("Sampler->retrieving the 100 rows");
|
|
//
|
|
// // preparing the query to get the first 100 rows of a table
|
|
//
|
|
// List<Object> resultSet = null;
|
|
//
|
|
// String querySampleOnTable = null;
|
|
//
|
|
// // get a formatted list columns
|
|
//
|
|
// String listAttributes = null;
|
|
// listAttributes = getQuery(connection, dbSession, DBType, tableName,
|
|
// schemaName, DataTypeColumns);
|
|
//
|
|
// // preparing the query
|
|
//
|
|
// if (DBType.equals(POSTGRES)) {
|
|
//
|
|
// querySampleOnTable = String.format(
|
|
// queryForRandomSampleOnTablePostgres, listAttributes,
|
|
// tableName);
|
|
//
|
|
// }
|
|
//
|
|
// if (DBType.equals(MYSQL)) {
|
|
//
|
|
// querySampleOnTable = String
|
|
// .format(queryForRandomSampleOnTableMysql, listAttributes,
|
|
// tableName);
|
|
//
|
|
// }
|
|
//
|
|
// AnalysisLogger.getLogger()
|
|
// .debug("Sampler->preparing to submit the query: "
|
|
// + querySampleOnTable);
|
|
//
|
|
// resultSet = connection.executeQuery(querySampleOnTable, dbSession);
|
|
//
|
|
// AnalysisLogger.getLogger().debug(
|
|
// "Sampler->query submitted successfully");
|
|
//
|
|
// if (resultSet == null) {
|
|
// AnalysisLogger
|
|
// .getLogger()
|
|
// .debug("Sampler->Error: The resulting table has not rows. Sample operation not possible");
|
|
//
|
|
// throw new Exception(
|
|
// "The resulting table has not rows. Sample operation not possible");
|
|
//
|
|
// }
|
|
//
|
|
// // return the first 100 rows
|
|
// return resultSet;
|
|
//
|
|
// }
|
|
|
|
// retrieve 100 rows of a table randomly
|
|
public List<Object> randomSampleOnTable(ConnectionManager connection,
|
|
SessionFactory dbSession, String DBType, String tableName,
|
|
String schemaName, long NumRows, List<String> DataTypeColumns)
|
|
throws Exception {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->starting the Random Sample on table operation");
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->retrieving the 100 rows");
|
|
|
|
// preparing the query to get the first 100 rows of a table
|
|
|
|
List<Object> resultSet = null;
|
|
|
|
String querySampleOnTable = null;
|
|
|
|
// get a formatted list columns
|
|
|
|
String listAttributes = null;
|
|
|
|
listAttributes = getQuery(connection, dbSession, DBType, tableName,
|
|
schemaName, DataTypeColumns);
|
|
|
|
// preparing the query
|
|
|
|
// if is rows number <= 700000 then the pure random sample procedure is
|
|
// performed otherwise a not pure ranom sample procedure is performed in
|
|
// order to solve a bug with the random function in postgres
|
|
|
|
if ((NumRows <= 700000) && (DBType.equals(POSTGRES))) { // Postgres
|
|
|
|
querySampleOnTable = String.format(
|
|
queryForRandomSampleOnTablePostgres, listAttributes,
|
|
tableName);
|
|
|
|
}
|
|
|
|
if ((NumRows > 700000) && (DBType.equals(POSTGRES))) { // Postgres
|
|
|
|
// generate an index randomly to execute the query
|
|
|
|
// Define threshold
|
|
int threshold = 700000;
|
|
int X;
|
|
|
|
// generate an index used to execute the query
|
|
Random rn = new Random();
|
|
|
|
if ((threshold + 100) <= NumRows) {
|
|
|
|
X = rn.nextInt(threshold + 1) + 100; // generate a number in
|
|
// range [100-700000]
|
|
AnalysisLogger.getLogger().debug("Sampler->X index: " + X);
|
|
|
|
}
|
|
|
|
else {
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler-> 700000+100 > rows number");
|
|
|
|
int offset = ((int) NumRows - threshold);
|
|
int valForUpperIndex = 100 - offset;
|
|
int UpperIndex = threshold - valForUpperIndex;
|
|
|
|
// Generate an X index
|
|
X = rn.nextInt(UpperIndex + 1) + 100; // generate a number in
|
|
// range
|
|
// [100-UpperIndex]
|
|
|
|
AnalysisLogger.getLogger().debug("Sampler->X index: " + X);
|
|
|
|
}
|
|
|
|
querySampleOnTable = String.format(
|
|
queryForRandomSampleWithThresholdOnTablePostgres,
|
|
listAttributes, tableName, X);
|
|
|
|
}
|
|
|
|
if (DBType.equals(MYSQL)) { // MySQL
|
|
|
|
querySampleOnTable = String
|
|
.format(queryForRandomSampleOnTableMysql, listAttributes,
|
|
tableName);
|
|
|
|
}
|
|
|
|
AnalysisLogger.getLogger()
|
|
.debug("Sampler->preparing to submit the query: "
|
|
+ querySampleOnTable);
|
|
|
|
resultSet = connection.executeQuery(querySampleOnTable, dbSession);
|
|
|
|
AnalysisLogger.getLogger().debug(
|
|
"Sampler->query submitted successfully");
|
|
|
|
if (resultSet == null) {
|
|
AnalysisLogger
|
|
.getLogger()
|
|
.debug("Sampler->Error: The resulting table has not rows. Sample operation not possible");
|
|
|
|
throw new Exception(
|
|
"The resulting table has not rows. Sample operation not possible");
|
|
|
|
}
|
|
|
|
// return the first 100 rows
|
|
return resultSet;
|
|
|
|
}
|
|
|
|
// to retrieve the columns names of a table
|
|
public List<String> getListColumns() {
|
|
|
|
return listColumns;
|
|
|
|
}
|
|
|
|
}
|