package org.gcube.dataanalysis.databases.sampler; import java.util.ArrayList; import java.util.Collections; import java.util.List; import java.util.LinkedHashMap; import java.util.Random; import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger; import org.gcube.dataanalysis.databases.utils.ConnectionManager; import org.hibernate.SessionFactory; /** * Class that allows to perform different types of Sample operations on a table: * SampleOnTable, SmartSampleOnTable, RandomSampleOnTable */ public class Sampler { // query to perform sample operation on the table private static final String queryForSampleOnTablePostgres = "select %1$s from \"%2$s\" limit 100"; private static final String queryForSampleOnTableMysql = "select %1$s from %2$s limit 100"; // query to perform a smart sample operation randomly on the table private static final String queryForSmartSampleOnTablePostgres = "select %1$s from \"%2$s\" order by random() limit 200"; private static final String queryForSmartSampleOnTableMysql = "select %1$s from %2$s order by rand() limit 200"; // private static final String queryForSmartSampleOnTablePostgres = // "select * from \"%1$s\" order by random() limit 200"; // private static final String queryForSmartSampleOnTableMysql = // "select * from %1$s order by rand() limit 200"; // query to perform a smart sample operation on the table considering the // threshold private static final String queryForSmartSampleWithThresholdOnTablePostgres = "select %1$s from \"%2$s\" limit 200 offset %3$s"; // private static final String queryForSmartSampleWithThresholdOnTableMysql // = "select %1$s from %2$s limit 200 offset %3$s"; // query to perform a sample operation randomly on a table // private static final String queryForRandomSampleOnTablePostgres = // "select %1$s from \"%2$s\" order by random() limit 100"; // query to perform a smart sample operation on the table considering the // threshold private static final String queryForRandomSampleWithThresholdOnTablePostgres = "select %1$s from \"%2$s\" limit 100 offset %3$s"; private static final String queryForRandomSampleOnTableMysql = "select %1$s from %2$s order by rand() limit 100"; private static final String queryForRandomSampleOnTablePostgres = "select %1$s from %2$s order by random() limit 100"; // query to get columns' name private static final String queryForColumnsPostgres = "SELECT column_name FROM information_schema.COLUMNS WHERE table_name ='%1$s' and table_schema='%2$s'"; private static final String queryForColumnsMysql = "SELECT column_name FROM information_schema.COLUMNS WHERE table_name ='%1$s' and table_schema='%2$s'"; private static final String MYSQL = "MySQL"; private static final String POSTGRES = "Postgres"; private List listColumns = null; public Sampler() { } // retrieve the first 100 rows of a table public List sampleOnTable(ConnectionManager connection, SessionFactory dbSession, String DBType, String tableName, String schemaName, List DataTypeColumns) throws Exception { AnalysisLogger.getLogger().debug( "Sampler->starting the Sample on table operation"); AnalysisLogger.getLogger().debug( "Sampler->retrieving the first 100 rows"); // preparing the query to get the first 100 rows of a table List resultSet = null; String querySampleOnTable = null; // get a formatted list columns String listAttributes = null; listAttributes = getQuery(connection, dbSession, DBType, tableName, schemaName, DataTypeColumns); // preparing the query if (DBType.equals(POSTGRES)) { querySampleOnTable = String.format(queryForSampleOnTablePostgres, listAttributes, tableName); } if (DBType.equals(MYSQL)) { querySampleOnTable = String.format(queryForSampleOnTableMysql, listAttributes, tableName); } AnalysisLogger.getLogger() .debug("Sampler->preparing to submit the query: " + querySampleOnTable); resultSet = connection.executeQuery(querySampleOnTable, dbSession); AnalysisLogger.getLogger().debug( "Sampler->query submitted successfully"); if (resultSet == null) { AnalysisLogger .getLogger() .debug("Sampler->Error: The table has not rows. Sample operation not possible"); throw new Exception( "The resulting table has not rows. Sample operation not possible"); } // return the first 100 rows return resultSet; } // preparing the query to get the first 100 rows of a table private String getQuery(ConnectionManager connection, SessionFactory dbSession, String DBType, String tableName, String schemaName, List DataTypeColumns) throws Exception { // List listColumns = null; // get columns list listColumns = getListColumns(connection, dbSession, DBType, tableName, schemaName); // String querySampleOnTable = null; String listAttributes = null; if (listColumns != null) { // preparing the query to get the first 100 rows of a table // preparing the query with formatted list column names listAttributes = ""; String attribute = null; for (int i = 0; i < listColumns.size(); i++) { if (DBType.equals(POSTGRES)) { attribute = "CAST(" + listColumns.get(i) + " as text), "; if (i == (listColumns.size() - 1)) { attribute = "CAST(" + listColumns.get(i) + " as text)"; } } // for a value whose datatype is char or varchar a cast to utf8 // is performed while for other datatypes in order to return a // correct value a cast to binary and a second to char are // performed.(because a cast of large numerical values to char // are truncated) if (DBType.equals(MYSQL)) { if (DataTypeColumns.get(i).contains("char")) { // attribute = "CAST(" + listColumns.get(i) + // " as CHAR CHARACTER SET utf8), "; // attribute = "CONVERT(" + listColumns.get(i) + // ", CHAR), "; attribute = "CAST(" + listColumns.get(i) + " as CHAR CHARACTER SET utf8), "; if (i == (listColumns.size() - 1)) { // attribute = "CAST(" + listColumns.get(i) + // " as CHAR CHARACTER SET utf8)"; // attribute = "CONVERT(" + listColumns.get(i) + // ", BINARY)"; attribute = "CAST(" + listColumns.get(i) + " as CHAR CHARACTER SET utf8)"; } } else { attribute = "CAST(CAST(" + listColumns.get(i) + " as BINARY) as CHAR CHARACTER SET utf8), "; if (i == (listColumns.size() - 1)) { // attribute = "CAST(" + listColumns.get(i) + // " as CHAR CHARACTER SET utf8)"; // attribute = "CONVERT(" + listColumns.get(i) + // ", BINARY)"; attribute = "CAST(CAST(" + listColumns.get(i) + " as BINARY) as CHAR CHARACTER SET utf8)"; } } } listAttributes = listAttributes + attribute; } } return listAttributes; } // get list columns of a table private List getListColumns(ConnectionManager connection, SessionFactory dbSession, String DBType, String tableName, String schemaName) throws Exception { AnalysisLogger.getLogger().debug("Sampler->retrieving column names"); // preparing the query to get columns' names String queryColumns = null; // build the query for database postgres. The parameter "schemaName" is // the schema name. if (DBType.equals(POSTGRES)) { queryColumns = String.format(queryForColumnsPostgres, tableName, schemaName); } // build the query for database mysql. The parameter "schemaName" is the // database name. if (DBType.equals(MYSQL)) { queryColumns = String.format(queryForColumnsMysql, tableName, schemaName); } List columnsSet = null; List listColumns = null; columnsSet = connection.executeQuery(queryColumns, dbSession); AnalysisLogger.getLogger().debug( "Sampler->query submitted successfully: " + queryColumns); if (columnsSet != null) { listColumns = new ArrayList(); // //print check // AnalysisLogger.getLogger().debug( // "DatabaseManagement->size: " + columnsSet.size()); for (int i = 0; i < columnsSet.size(); i++) { Object element = columnsSet.get(i); // //print check // AnalysisLogger.getLogger().debug( // "Sampler->values: " + element); ArrayList listvalues = new ArrayList( ((LinkedHashMap) element).values()); // //print check // AnalysisLogger.getLogger().debug( // "Sampler->values: " + listvalues); listColumns.add((String) listvalues.get(0)); } } return listColumns; } // retrieve 100 rows of a table randomly that have the maximum number of // columns not null public List smartSampleOnTable(ConnectionManager connection, SessionFactory dbSession, String DBType, String tableName, String schemaName, long NumRows, List DataTypeColumns) throws Exception { AnalysisLogger.getLogger().debug( "Sampler->starting the Smart Sample on table operation"); if (NumRows == 0) { throw new Exception( "The table has not rows. Smart Sample operation not possible"); } // // computation of the iterations number // int NIterations = computeNumberIterations(NumRows); // // AnalysisLogger.getLogger().debug( // "Sampler->Iterations number: " + NIterations); // // // computation of the 100 rows randomly // // AnalysisLogger.getLogger().debug("Sampler->retrieving rows"); List rows = null; // if is rows number <= 700000 then the pure smart sample procedure is // performed otherwise a not pure smart sample procedure is performed in // order to solve a bug with the random function in postgres if ((NumRows > 700000) && (DBType.equals(POSTGRES))) { // Postgres // compute the smart sample on a table rows = computeSmartSampleWithThreshold(connection, dbSession, DBType, tableName, schemaName, NumRows, DataTypeColumns); } else { // computation of the iterations number int NIterations = computeNumberIterations(NumRows); AnalysisLogger.getLogger().debug( "Sampler->Iterations number: " + NIterations); // computation of the 100 rows randomly AnalysisLogger.getLogger().debug("Sampler->retrieving rows"); // compute the smart sample on a table rows = computeSmartSample(connection, dbSession, DBType, tableName, schemaName, NIterations, DataTypeColumns, DataTypeColumns.size()); } // if ((NumRows <= 700000) && (DBType.equals(POSTGRES))) { // Postgres // // // computation of the iterations number // int NIterations = computeNumberIterations(NumRows); // // AnalysisLogger.getLogger().debug( // "Sampler->Iterations number: " + NIterations); // // // computation of the 100 rows randomly // // AnalysisLogger.getLogger().debug("Sampler->retrieving rows"); // // // compute the smart sample on a table // rows = computeSmartSample(connection, dbSession, DBType, tableName, // schemaName, NIterations, DataTypeColumns, // DataTypeColumns.size()); // // } // // else if ((NumRows > 700000) && (DBType.equals(POSTGRES))) { // // Postgres // // compute the smart sample on a table // rows = computeSmartSampleWithThreshold(connection, dbSession, // DBType, tableName, schemaName, NumRows, DataTypeColumns); // // } else { // MySQL // // // computation of the iterations number // int NIterations = computeNumberIterations(NumRows); // // AnalysisLogger.getLogger().debug( // "Sampler->Iterations number: " + NIterations); // // // computation of the 100 rows randomly // // AnalysisLogger.getLogger().debug("Sampler->retrieving rows"); // // // compute the smart sample on a table // rows = computeSmartSample(connection, dbSession, DBType, tableName, // schemaName, NIterations, DataTypeColumns, // DataTypeColumns.size()); // // } if (rows == null) { AnalysisLogger .getLogger() .debug("Sampler->Error: the Smart Sample operation on table has not returned rows"); throw new Exception( "The Smart Sample operation on table has not returned rows"); } AnalysisLogger.getLogger().debug("Sampler->rows retrieved"); // return the first 100 rows return rows; } private int computeNumberIterations(long NumRows) { AnalysisLogger.getLogger().debug( "Sampler->processing iterations number"); AnalysisLogger.getLogger().debug("Sampler->rows number: " + NumRows); // build the formula k=(((-0.8)*NumRows)/10000)+1 double k = (((-0.8) * NumRows) / 10000) + 1.0; // if the the parameter k is negative, the sign must be changed double paramK = 0.0; if (Double.compare(k, 0.0) < 0) { paramK = k * (-1); } else { paramK = k; } AnalysisLogger.getLogger().debug( "Sampler->parameter K value: " + paramK); long NumElements = Math.min(NumRows, (long) 10000); AnalysisLogger.getLogger().debug( "Sampler->choosing the min value of elements: " + NumElements); // to build the formula NIterations=(k/200)*Nelementi double NumIterations = (paramK / 200) * NumElements; AnalysisLogger.getLogger().debug( "Sampler->iterations number: " + NumIterations); double Iterations = Math.max(Math.round(NumIterations), 1); AnalysisLogger.getLogger() .debug("Sampler-> choosing the max value of iterations: " + Iterations); double NumIts = Math.min(Iterations, 2.0); AnalysisLogger.getLogger().debug( "Sampler-> choosing the min value of iterations: " + NumIts); // to round the value (with a rint logic) return (int) (Math.rint(NumIts)); } // compute the SmartSampleTable. It extracts 200 rows randomly for each // iteration. Then it checks if the row with index equal to 100 has the max // score (that is equal to the columns' number). In this case the row list // is cut in order to return the first 100 rows. private List computeSmartSample(ConnectionManager connection, SessionFactory dbSession, String DBType, String tablename, String schemaName, int NIterations, List DataTypeColumns, int ColumnSize) throws Exception { List resultSet = null; String query = null; boolean removal = false; // map that contains for each row two information: the index // corresponding to the row and the score number that is the columns' // number with not null value // HashMap MapRows = new HashMap(); List listRows = new ArrayList(); // get a formatted list columns String listAttributes = null; listAttributes = getQuery(connection, dbSession, DBType, tablename, schemaName, DataTypeColumns); // compute score for each row of the table // build the query for database postgres if (DBType.equals(POSTGRES)) { query = String.format(queryForSmartSampleOnTablePostgres, listAttributes, tablename); } // build the query for database mysql if (DBType.equals(MYSQL)) { query = String.format(queryForSmartSampleOnTableMysql, listAttributes, tablename); } AnalysisLogger.getLogger().debug( "Sampler->building the query extracting 200 rows randomly"); Object[] columnArray = null; // define columns number with the threshold AnalysisLogger.getLogger().debug( "Sampler-> column array dimension: " + ColumnSize); double thresholdRank = ((ColumnSize) * 80); thresholdRank = thresholdRank / 100; double valCeil = Math.round(thresholdRank); AnalysisLogger.getLogger().debug( "Sampler-> number column generated by the threshold: " + thresholdRank + " rounded value: " + valCeil); // extract 200 rows randomly for each iteration extractionRows: for (int i = 0; i < NIterations; i++) { System.out.println("index iteration: " + i); AnalysisLogger.getLogger().debug( "Sampler->executing the query: " + query); resultSet = connection.executeQuery(query, dbSession); if (resultSet != null) { int numrows = resultSet.size(); AnalysisLogger.getLogger().debug( "Sampler->rows number: " + numrows); AnalysisLogger .getLogger() .debug("Sampler->computing the score and sort the row list in a reverse natural order"); // build the list with 200 rows for (int j = 0; j < numrows; j++) { Object element = resultSet.get(j); ArrayList listvalues = new ArrayList( ((LinkedHashMap) element).values()); columnArray = listvalues.toArray(); // compute the score for each row of the table int score = computeColumnScore(columnArray); // //check row score // AnalysisLogger.getLogger().debug( // "Sampler->row: " + j + " " + "score: " + score); RowScore rs = new RowScore(element, score); // insert the row in the list listRows.add(rs); // //check the sorting of a row // AnalysisLogger // .getLogger() // .debug("Sampler->sorting the list in a reverse natural order"); // sort by reverse natural order Collections.sort(listRows, Collections.reverseOrder()); // After each iteration there is a check to verify the row // (corresponding to the element with index=100). If the // score // of // this row is equal to the columns' number then the // previous // rows // in the list have all columns value not null // int size = listRows.size(); // to check if the row with index 100 has the max score // thresholdRank 80% if (listRows.size() >= 100) { int value = listRows.get(99).getScore(); // if (value == columnArray.length) { // AnalysisLogger.getLogger().debug( // "Sampler-> column array dimension: " // + columnArray.length); // // double thresholdRank = ((columnArray.length) * 80); // thresholdRank = thresholdRank / 100; // // double valCeil = Math.round(thresholdRank); // // AnalysisLogger.getLogger().debug( // "Sampler-> number column generated by the threshold: " // + thresholdRank // + " rounded value: " + valCeil); if (value >= (int) valCeil) { // //check row score // for (int k = 0; k < listRows.size(); k++) { // // // AnalysisLogger.getLogger().debug( // "Sampler->row with index: " + k // + " score " // + listRows.get(k).getScore()); // } removal = true; AnalysisLogger.getLogger().debug( "Sampler->row 100 with score: " + value); AnalysisLogger.getLogger().debug( "Sampler->starting the removal operation"); // Remove the elements from index 100 if the list's // size // is // greater than 100 if (listRows.size() > 100) { int numElemToDelete = listRows.size() - 100; AnalysisLogger.getLogger().debug( "Sampler->number of rows to delete: " + numElemToDelete); while (numElemToDelete != 0) { listRows.remove(100); numElemToDelete = numElemToDelete - 1; } } break extractionRows; } } } } else { return null; } } // Remove the elements from index 100 if the list's size is // greater than 100 and if this operation has not been done previously. if ((listRows.size() > 100) && (removal == false)) { for (int k = 0; k < listRows.size(); k++) { AnalysisLogger.getLogger().debug( "Sampler->row with index: " + k + " score " + listRows.get(k).getScore()); } AnalysisLogger.getLogger().debug( "Sampler->starting the removal operation"); int numElemToDelete = listRows.size() - 100; AnalysisLogger.getLogger().debug( "Sampler->number of rows to delete: " + numElemToDelete); // cut the list of rows in order to have only 100 rows while (numElemToDelete != 0) { RowScore row = listRows.remove(100); AnalysisLogger.getLogger().debug( "Sampler->removing row with score: " + row.getScore()); numElemToDelete = numElemToDelete - 1; } } // return the list of 100 rows List rows = new ArrayList(); AnalysisLogger.getLogger().debug( "Sampler->preparing the result (the row list): "); for (int i = 0; i < listRows.size(); i++) { // //check rows added in the final result // AnalysisLogger.getLogger().debug( // "Sampler->adding row with index: " + i); rows.add(listRows.get(i).getRow()); } return rows; } // compute the SmartSampleTable considering the threshold 700000 on the rows // number . It extracts 200 rows randomly for each // iteration. Then it checks if the row with index equal to 100 has the max // score (that is equal to the columns' number). In this case the row list // is cut in order to return the first 100 rows. private List computeSmartSampleWithThreshold( ConnectionManager connection, SessionFactory dbSession, String DBType, String tablename, String schemaName, long NumRows, List DataTypeColumns) throws Exception { // Define threshold int threshold = 700000; int X, Y; // Generate randomly two indexes used to execute two queries Random rn = new Random(); if ((threshold + 200) <= NumRows) { AnalysisLogger.getLogger().debug( "Sampler-> 700000+200 <= rows number"); X = rn.nextInt(threshold + 1) + 200; // generate a number in // range [200-700000] AnalysisLogger.getLogger().debug("Sampler->X index: " + X); // Generate a Y index // Define Lower and Upper Index (LI and UL) of a range int LI = X + 200; int UI = X - 200; AnalysisLogger.getLogger().debug( "Sampler->Lower Index of the range: " + LI); AnalysisLogger.getLogger().debug( "Sampler->Upper Index of the range: " + UI); int a; do { a = rn.nextInt(threshold + 1) + 0; } while (!((a < UI) || (a > LI))); Y = a; AnalysisLogger.getLogger().debug("Sampler->Y index: " + Y); } else { AnalysisLogger.getLogger().debug( "Sampler-> 700000+200 > rows number"); int offset = ((int) NumRows - threshold); int valForUpperIndex = 200 - offset; int UpperIndex = threshold - valForUpperIndex; // Generate an X index X = rn.nextInt(UpperIndex + 1) + 200; // generate a number in // range // [200-UpperIndex] AnalysisLogger.getLogger().debug("Sampler->X index: " + X); // Generate a Y index // Define Lower and Upper Index (LI and UL) of a range int LI = X + 200; int UI = X - 200; AnalysisLogger.getLogger().debug( "Sampler->Lower Index of the range: " + LI); AnalysisLogger.getLogger().debug( "Sampler->Upper Index of the range: " + UI); int a; do { a = rn.nextInt(UpperIndex + 1) + 0; } while (!((a < UI) || (a > LI))); Y = a; } int[] indexes = new int[2]; indexes[0] = X; indexes[1] = Y; // AnalysisLogger.getLogger().debug("Sampler->X index: " + indexes[0]); // AnalysisLogger.getLogger().debug("Sampler->Y index: " + indexes[1]); // start sample operation List resultSet = null; String query = null; boolean removal = false; // map that contains for each row two information: the index // corresponding to the row and the score number that is the columns' // number with not null value // HashMap MapRows = new HashMap(); List listRows = new ArrayList(); // get a formatted list columns String listAttributes = null; listAttributes = getQuery(connection, dbSession, DBType, tablename, schemaName, DataTypeColumns); // compute score for each row of the table // // build the query for database postgres // if (DBType.equals(POSTGRES)) { // // query = String.format( // queryForSmartSampleWithThresholdOnTablePostgres, // listAttributes, tablename); // // } // // build the query for database mysql // if (DBType.equals(MYSQL)) { // // query = String.format(queryForSmartSampleOnTableMysql, // listAttributes, tablename); // // } AnalysisLogger.getLogger().debug( "Sampler->building the query extracting 200 rows randomly"); Object[] columnArray = null; // extract 200 rows randomly for each iteration // Define the two queries.One query uses the X index, one query uses the // Y index. Each query extract 200 rows. // computation for the smart procedure extractionRows: for (int i = 0; i < 2; i++) { // build the query for database postgres if (DBType.equals(POSTGRES)) { query = String.format( queryForSmartSampleWithThresholdOnTablePostgres, listAttributes, tablename, indexes[i]); } AnalysisLogger.getLogger().debug( "Sampler->executing the query: " + query); resultSet = connection.executeQuery(query, dbSession); if (resultSet != null) { int numrows = resultSet.size(); AnalysisLogger.getLogger().debug( "Sampler->rows number: " + numrows); AnalysisLogger .getLogger() .debug("Sampler->computing the score and sorting the row list in a reverse natural order"); // build the list with 200 rows for (int j = 0; j < numrows; j++) { Object element = resultSet.get(j); ArrayList listvalues = new ArrayList( ((LinkedHashMap) element).values()); columnArray = listvalues.toArray(); // compute the score for each row of the table int score = computeColumnScore(columnArray); // //check row score // AnalysisLogger.getLogger().debug( // "Sampler->row: " + j + " " + "score: " + score); RowScore rs = new RowScore(element, score); // insert the row in the list listRows.add(rs); // //check the sorting of a row // AnalysisLogger // .getLogger() // .debug("Sampler->sorting the list in a reverse natural order"); // sort by reverse natural order Collections.sort(listRows, Collections.reverseOrder()); // After each iteration there is a check to verify the row // (corresponding to the element with index=100). If the // score // of // this row is equal to the columns' number then the // previous // rows // in the list have all columns value not null // int size = listRows.size(); // to check if the row with index 100 has the max score // threshold 80% if (listRows.size() >= 100) { int value = listRows.get(99).getScore(); // if (value == columnArray.length) { AnalysisLogger.getLogger().debug( "Sampler-> column array dimension: " + columnArray.length); double thresholdRank = ((columnArray.length) * 80); thresholdRank = thresholdRank / 100; double valCeil = Math.round(thresholdRank); AnalysisLogger.getLogger().debug( "Sampler-> threshold: " + thresholdRank + " rounded value: " + valCeil); if (value >= (int) valCeil) { // //check row score // for (int k = 0; k < listRows.size(); k++) { // // AnalysisLogger.getLogger().debug( // "Sampler->row with index: " + k // + " score " // + listRows.get(k).getScore()); // } removal = true; AnalysisLogger.getLogger().debug( "Sampler->row 100 with score: " + value); AnalysisLogger.getLogger().debug( "Sampler->starting the removal operation"); // Remove the elements from index 100 if the list's // size // is // greater than 100 if (listRows.size() > 100) { int numElemToDelete = listRows.size() - 100; AnalysisLogger.getLogger().debug( "Sampler->number of rows to delete: " + numElemToDelete); while (numElemToDelete != 0) { listRows.remove(100); numElemToDelete = numElemToDelete - 1; } } break extractionRows; } } } } else { return null; } } // Remove the elements from index 100 if the list's size is // greater than 100 and if this operation has not been done previously. if ((listRows.size() > 100) && (removal == false)) { // check score of the row list // for (int k = 0; k < listRows.size(); k++) { // // AnalysisLogger.getLogger().debug( // "Sampler->row with index: " + k + " score " // + listRows.get(k).getScore()); // } AnalysisLogger.getLogger().debug( "Sampler->starting the removal operation"); int numElemToDelete = listRows.size() - 100; AnalysisLogger.getLogger().debug( "Sampler->number of rows to delete: " + numElemToDelete); // cut the list of rows in order to have only 100 rows while (numElemToDelete != 0) { RowScore row = listRows.remove(100); AnalysisLogger.getLogger().debug( "Sampler->removing row with score: " + row.getScore()); numElemToDelete = numElemToDelete - 1; } } // return the list of 100 rows List rows = new ArrayList(); AnalysisLogger.getLogger().debug( "Sampler->preparing the result (the row list): "); for (int i = 0; i < listRows.size(); i++) { // //check the row list result // AnalysisLogger.getLogger().debug( // "Sampler->adding row with index: " + i); rows.add(listRows.get(i).getRow()); } return rows; } // compute the score for each array (the score is the number of table // columns with value not null and not empty) private int computeColumnScore(Object[] columnArray) { int score = 0; for (int i = 0; i < columnArray.length; i++) { if (columnArray[i] != null) { if (!(columnArray[i].toString().equals(""))) { score++; } } } return score; } // // retrieve 100 rows of a table randomly // public List randomSampleOnTable(ConnectionManager connection, // SessionFactory dbSession, String DBType, String tableName, // String schemaName, List DataTypeColumns) throws Exception { // // AnalysisLogger.getLogger().debug( // "Sampler->starting the Random Sample on table operation"); // // AnalysisLogger.getLogger().debug("Sampler->retrieving the 100 rows"); // // // preparing the query to get the first 100 rows of a table // // List resultSet = null; // // String querySampleOnTable = null; // // // get a formatted list columns // // String listAttributes = null; // listAttributes = getQuery(connection, dbSession, DBType, tableName, // schemaName, DataTypeColumns); // // // preparing the query // // if (DBType.equals(POSTGRES)) { // // querySampleOnTable = String.format( // queryForRandomSampleOnTablePostgres, listAttributes, // tableName); // // } // // if (DBType.equals(MYSQL)) { // // querySampleOnTable = String // .format(queryForRandomSampleOnTableMysql, listAttributes, // tableName); // // } // // AnalysisLogger.getLogger() // .debug("Sampler->preparing to submit the query: " // + querySampleOnTable); // // resultSet = connection.executeQuery(querySampleOnTable, dbSession); // // AnalysisLogger.getLogger().debug( // "Sampler->query submitted successfully"); // // if (resultSet == null) { // AnalysisLogger // .getLogger() // .debug("Sampler->Error: The resulting table has not rows. Sample operation not possible"); // // throw new Exception( // "The resulting table has not rows. Sample operation not possible"); // // } // // // return the first 100 rows // return resultSet; // // } // retrieve 100 rows of a table randomly public List randomSampleOnTable(ConnectionManager connection, SessionFactory dbSession, String DBType, String tableName, String schemaName, long NumRows, List DataTypeColumns) throws Exception { AnalysisLogger.getLogger().debug( "Sampler->starting the Random Sample on table operation"); AnalysisLogger.getLogger().debug("Sampler->retrieving the 100 rows"); // preparing the query to get the first 100 rows of a table List resultSet = null; String querySampleOnTable = null; // get a formatted list columns String listAttributes = null; listAttributes = getQuery(connection, dbSession, DBType, tableName, schemaName, DataTypeColumns); // preparing the query // if is rows number <= 700000 then the pure random sample procedure is // performed otherwise a not pure ranom sample procedure is performed in // order to solve a bug with the random function in postgres if ((NumRows <= 700000) && (DBType.equals(POSTGRES))) { // Postgres querySampleOnTable = String.format( queryForRandomSampleOnTablePostgres, listAttributes, tableName); } if ((NumRows > 700000) && (DBType.equals(POSTGRES))) { // Postgres // generate an index randomly to execute the query // Define threshold int threshold = 700000; int X; // generate an index used to execute the query Random rn = new Random(); if ((threshold + 100) <= NumRows) { X = rn.nextInt(threshold + 1) + 100; // generate a number in // range [100-700000] AnalysisLogger.getLogger().debug("Sampler->X index: " + X); } else { AnalysisLogger.getLogger().debug( "Sampler-> 700000+100 > rows number"); int offset = ((int) NumRows - threshold); int valForUpperIndex = 100 - offset; int UpperIndex = threshold - valForUpperIndex; // Generate an X index X = rn.nextInt(UpperIndex + 1) + 100; // generate a number in // range // [100-UpperIndex] AnalysisLogger.getLogger().debug("Sampler->X index: " + X); } querySampleOnTable = String.format( queryForRandomSampleWithThresholdOnTablePostgres, listAttributes, tableName, X); } if (DBType.equals(MYSQL)) { // MySQL querySampleOnTable = String .format(queryForRandomSampleOnTableMysql, listAttributes, tableName); } AnalysisLogger.getLogger() .debug("Sampler->preparing to submit the query: " + querySampleOnTable); resultSet = connection.executeQuery(querySampleOnTable, dbSession); AnalysisLogger.getLogger().debug( "Sampler->query submitted successfully"); if (resultSet == null) { AnalysisLogger .getLogger() .debug("Sampler->Error: The resulting table has not rows. Sample operation not possible"); throw new Exception( "The resulting table has not rows. Sample operation not possible"); } // return the first 100 rows return resultSet; } // to retrieve the columns names of a table public List getListColumns() { return listColumns; } }