- In case the Worker cannot be reached during a full-texts batch request, abort the rest of the batches.
- Fix memory leaks when unzipping the batch-zip-file.
- Add explanatory comments for picking the database related to a full-text file.
logger.info("Going to create (if not exist) the database \""+databaseName+"\" and its tables. Also will fill some tables with data from database \""+initialDatabaseName+"\".");
privatevoidcreateDatabase()
{
if(isTestEnvironment){
logger.info("Going to create (if not exist) the database \""+testDatabaseName+"\" and its tables. Also will fill some tables with data from database \""+initialDatabaseName+"\".");
jdbcTemplate.execute("CREATE DATABASE IF NOT EXISTS "+testDatabaseName);
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+testDatabaseName+".publication stored as parquet as select * from "+initialDatabaseName+".publication");
jdbcTemplate.execute("CREATE DATABASE IF NOT EXISTS "+databaseName);
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+testDatabaseName+".publication_pids stored as parquet as select * from "+initialDatabaseName+".publication_pids");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+databaseName+".publication stored as parquet as select * from "+initialDatabaseName+".publication");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+testDatabaseName +".publication_urls stored as parquet as select * from " +initialDatabaseName+".publication_urls");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+databaseName +".publication_pids stored as parquet as select * from " +initialDatabaseName+".publication_pids");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+testDatabaseName +".datasource stored as parquet as select * from " +initialDatabaseName+".datasource");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+databaseName +".publication_urls stored as parquet as select * from "+initialDatabaseName+".publication_urls");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+testDatabaseName +".assignment (id string, original_url string, workerid string, `date` timestamp) stored as parquet");
databaseName=testDatabaseName;// For the rest of the queries.
}else
databaseName=initialDatabaseName;
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+databaseName+".assignment (id string, original_url string, workerid string, `date` timestamp) stored as parquet");
// For both cases, go check and create the tables which will be populated by the Controller.
// Drop the "current_assignment" table. It is a temporary table which is created on-demand during execution.
jdbcTemplate.execute("DROP TABLE IF EXISTS "+databaseName+".current_assignment PURGE");
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+databaseName+".attempt (id string, original_url string, `date` timestamp, status string, error_class string, error_message string) stored as parquet");
@ -73,7 +86,7 @@ public class ImpalaConnector {
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS "+databaseName+".payload (id string, original_url string, actual_url string, `date` timestamp, mimetype string, size string, `hash` string, `location` string, provenance string) stored as parquet");
"join "+ImpalaConnector.databaseName +".publication_urls pu on pu.id=p.id\n"+
"join "+ImpalaConnector.databaseName +".datasource d on d.id=p.datasourceid\n"+
"left outer join (select count(a.id) as counts, a.id from "+ImpalaConnector.databaseName +".attempt a group by a.id) as attempts on attempts.id=p.id\n"+
"left outer join (select a.id, a.original_url from "+ImpalaConnector.databaseName +".assignment a\n"+
"union all\n"+
"select pl.id, pl.original_url from "+ databaseName +".payload pl)\n"+
"select pl.id, pl.original_url from "+ImpalaConnector.databaseName +".payload pl)\n"+
"as existing on existing.id=p.id and existing.original_url=pu.url\n"+
"where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= "+maxAttemptsPerRecordAtomic.get()+
"\nand not exists (select 1 from "+ databaseName +".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n"+
"\nand not exists (select 1 from "+ImpalaConnector.databaseName +".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n"+
"limit "+(assignmentsLimit*10)+")\n"+
"as non_distinct_results\n"+
"order by coalesce(attempt_count, 0), reverse(pubid), url\n"+
@ -105,9 +101,9 @@ public class UrlController {
// The "order by" in the end makes sure the older attempted records will be re-attempted after a long time.
//logger.debug(findAssignmentsQuery); // DEBUG!
StringcreateCurrentAssignmentsQuery="create table "+ databaseName +".current_assignment as \n"+findAssignmentsQuery;
jdbcTemplate.execute("CREATE TABLE "+ImpalaConnector.databaseName +"."+tableName+"_tmp stored as parquet AS SELECT * FROM "+ImpalaConnector.databaseName +"."+tableName+" "+whereClause+parameter);
errorMsg="Problem when executing the \"clone-drop-rename\" queries!\n";
logger.error(errorMsg,e);
@ -115,7 +111,7 @@ public class FileUtils {
intnumFilesFoundFromPreviousAssignmentsBatches=0;
inturlReportsSize=urlReports.size();
HashMultimap<String,Payload>allFileNamesWithPayloads=HashMultimap.create((urlReportsSize/5),3);// Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
StringgetFileLocationForHashQuery="select `location` from "+ databaseName +".payload where `hash` = ? limit 1";
StringgetFileLocationForHashQuery="select `location` from "+ImpalaConnector.databaseName +".payload where `hash` = ? limit 1";
//logger.debug("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]");
@ -341,7 +354,12 @@ public class FileUtils {
returnnull;
}
}catch(Exceptione){
logger.warn("Problem when requesting the ZipFile of batch_"+batchNum+" of assignments_"+assignmentsBatchCounter+" from the Worker with ID \""+workerId+"\" and requestUrl: "+requestUrl+"\n"+e.getMessage());
StringexMessage=e.getMessage();
logger.warn("Problem when requesting the ZipFile of batch_"+batchNum+" of assignments_"+assignmentsBatchCounter+" from the Worker with ID \""+workerId+"\" and requestUrl: "+requestUrl+"\n"+exMessage);
if(exMessage.contains("Connection refused")){
logger.error("Since we received a \"Connection refused\", all of the remaining batches ("+(totalBatches-batchNum)+") will not be requested!");
# In case the "isTestEnvironment" is "true", the "testDatabase" below and its tables are created (if not exist).
# The tables "datasource", "publication", "publication_pids" and "publication_urls" are filled with the data from the same tables existing in the "initialDatabase".
# In case the "isTestEnvironment" is "false", the "initialDatabase" is used. The Controller assumes that the above 4 tables are present, and only creates the following tables:
# "assignment", "attempt" and "payload", which are populated during execution.