From 91f460ce5188490c572e5f8da7278515209dc0c3 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Fri, 28 Jan 2022 00:41:29 +0200 Subject: [PATCH 01/11] moved impala jar to omtd repository and updated build file --- build.gradle | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build.gradle b/build.gradle index f9b87fe..d5c8dfb 100644 --- a/build.gradle +++ b/build.gradle @@ -11,8 +11,8 @@ sourceCompatibility = '1.8' repositories { mavenCentral() maven { - name "icm" - url "http://esperos.di.uoa.gr/repo" + name "omtd" + url "https://repo.openminted.eu/content/repositories/releases/" allowInsecureProtocol = true } maven { @@ -71,4 +71,4 @@ configurations { test { useJUnitPlatform() -} \ No newline at end of file +} From bf26bf955f74b764321cdbfdd1f09649e92a9cc8 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Sun, 30 Jan 2022 22:14:52 +0200 Subject: [PATCH 02/11] springified project --- build.gradle | 11 +- .../openaire/urls_controller/Application.java | 25 +- .../configuration/ImpalaConnector.java | 249 ++--------- .../controllers/ImpalaController.java | 37 +- .../controllers/TestController.java | 82 ++++ .../controllers/UrlController.java | 396 +++++------------- .../util/ControllerConstants.java | 10 - .../urls_controller/util/FileUnZipper.java | 20 +- .../urls_controller/util/FileUtils.java | 350 +++------------- .../urls_controller/util/S3ObjectStore.java | 141 +++++++ .../util/S3ObjectStoreMinIO.java | 226 ---------- .../urls_controller/util/TestFileUtils.java | 147 +++++++ .../urls_controller/util/UriBuilder.java | 135 +++--- src/main/resources/application.properties | 26 +- 14 files changed, 694 insertions(+), 1161 deletions(-) create mode 100644 src/main/java/eu/openaire/urls_controller/controllers/TestController.java delete mode 100644 src/main/java/eu/openaire/urls_controller/util/ControllerConstants.java create mode 100644 src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java delete mode 100644 
src/main/java/eu/openaire/urls_controller/util/S3ObjectStoreMinIO.java create mode 100644 src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java diff --git a/build.gradle b/build.gradle index d5c8dfb..bb8d274 100644 --- a/build.gradle +++ b/build.gradle @@ -49,10 +49,17 @@ dependencies { implementation 'io.minio:minio:8.3.5' // https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp - implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' // This is required by the minio, as Spring uses a version which is not supported by minio. + implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' + // This is required by the minio, as Spring uses a version which is not supported by minio. // https://mvnrepository.com/artifact/com.cloudera.impala/jdbc - implementation group: 'com.cloudera.impala', name: 'jdbc', version: '2.5.31' + implementation("com.cloudera.impala:jdbc:2.5.31") { + exclude group: 'org.slf4j', module: 'slf4j-log4j12' + exclude group: 'org.apache.derby', module: 'derby' + exclude group: 'org.eclipse.jetty.aggregate', module: 'jetty-all' + exclude group: 'log4j', module: 'log4j' + exclude group: 'log4j', module: 'apache-log4j-extras' + } testImplementation group: 'org.springframework.security', name: 'spring-security-test' testImplementation "org.springframework.boot:spring-boot-starter-test" diff --git a/src/main/java/eu/openaire/urls_controller/Application.java b/src/main/java/eu/openaire/urls_controller/Application.java index 4b06804..1803e0b 100644 --- a/src/main/java/eu/openaire/urls_controller/Application.java +++ b/src/main/java/eu/openaire/urls_controller/Application.java @@ -1,8 +1,5 @@ package eu.openaire.urls_controller; - -import eu.openaire.urls_controller.configuration.ImpalaConnector; -import eu.openaire.urls_controller.util.S3ObjectStoreMinIO; import eu.openaire.urls_controller.util.UriBuilder; import org.springframework.boot.CommandLineRunner; import 
org.springframework.boot.SpringApplication; @@ -14,7 +11,6 @@ import org.springframework.web.cors.CorsConfiguration; import org.springframework.web.cors.CorsConfigurationSource; import org.springframework.web.cors.UrlBasedCorsConfigurationSource; -import javax.annotation.PreDestroy; import java.util.Arrays; import java.util.Collections; @@ -23,7 +19,6 @@ import java.util.Collections; public class Application { public static void main(String[] args) { - new S3ObjectStoreMinIO(); SpringApplication.run(Application.class, args); } @@ -38,20 +33,10 @@ public class Application { source.registerCorsConfiguration("/**", configuration); return source; } - - - @PreDestroy - public static void preDestroy() - { - if ( ImpalaConnector.hikariDataSource != null ) - ImpalaConnector.hikariDataSource.close(); - } - - - @Bean - public CommandLineRunner setServerBaseUrl(Environment environment) - { - return args -> new UriBuilder(environment); - } +// +// @Bean +// public CommandLineRunner setServerBaseUrl(Environment environment) { +// return args -> new UriBuilder(environment); +// } } \ No newline at end of file diff --git a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java index c0085e6..9d33dc4 100644 --- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java +++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java @@ -1,255 +1,80 @@ package eu.openaire.urls_controller.configuration; -import com.zaxxer.hikari.HikariConfig; -import com.zaxxer.hikari.HikariDataSource; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.stereotype.Repository; -import java.beans.PropertyVetoException; -import java.io.File; -import 
java.io.FileReader; -import java.sql.*; -import java.util.Properties; +import javax.annotation.PostConstruct; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; - +@Repository public final class ImpalaConnector { private static final Logger logger = LoggerFactory.getLogger(ImpalaConnector.class); - public static String impalaDriver; - public static String impalaConnectionUrl; - public static String poolName; - public static int hikariMaxConnectionPoolSize; - public static int hikariMinIdleConnections; - public static int hikariConnectionTimeOut; - public static int hikariIdleTimeOut; - public static int hikariMaxLifetime; + @Autowired + private JdbcTemplate jdbcTemplate; - public static String oldDatabaseName = "pdfaggregation_i"; - public static String databaseName = "pdfAggregationDatabase"; + @Value("services.pdfaggregation.controller.db.oldDatabaseName:pdfaggregation_i") + public static String oldDatabaseName; + @Value("services.pdfaggregation.controller.db.databaseName:pdfAggregationDatabase") + public static String databaseName; public static final Lock databaseLock = new ReentrantLock(true); // This lock is locking the threads trying to execute queries in the database. - public static HikariDataSource hikariDataSource; - - private static final ImpalaConnector singletonObject = new ImpalaConnector(); - - public static ImpalaConnector getInstance() - { - return singletonObject; - } - - - public ImpalaConnector() - { + @PostConstruct + public void init() { logger.info("Max available memory to the Controller: " + Runtime.getRuntime().maxMemory() + " bytes."); try { - String dbSettingsPropertyFile = System.getProperty("user.dir") + File.separator + "src" + File.separator + "main" + File.separator + "resources" + File.separator + "application.properties"; - FileReader fReader = new FileReader(dbSettingsPropertyFile); - Properties props = new Properties(); - props.load(fReader); // Load jdbc related properties. 
- - // Get each property value. - impalaDriver = props.getProperty("spring.impala.driver-class-name"); - if ( !"".equals(impalaDriver) ) { // If not "null" or empty. - Class.forName(impalaDriver); - impalaConnectionUrl = props.getProperty("spring.impala.url"); - poolName = props.getProperty("spring.datasource.hikari.pool-name"); - hikariMaxConnectionPoolSize = Integer.parseInt(props.getProperty("spring.datasource.hikari.maximumPoolSize")); - hikariMaxLifetime = Integer.parseInt(props.getProperty("spring.datasource.hikari.maxLifetime")); - hikariMinIdleConnections = Integer.parseInt(props.getProperty("spring.datasource.hikari.minimumIdle")); - hikariConnectionTimeOut = Integer.parseInt(props.getProperty("spring.datasource.hikari.connectionTimeout")); - hikariIdleTimeOut = Integer.parseInt(props.getProperty("spring.datasource.hikari.idleTimeout")); - } else - throw new RuntimeException("The \"impalaDriver\" was null or empty!"); - } catch(Exception e) { - String errorMsg = "Error when loading the database properties!\n" + e.getMessage(); - logger.error(errorMsg, e); - System.err.println(errorMsg); - System.exit(11); - } - - try { - hikariDataSource = impalaDS(); - } catch (SQLException | PropertyVetoException e) { - logger.error("Problem when creating the Hikari connection pool!", e); + if ( jdbcTemplate.getDataSource().getConnection().getMetaData().supportsBatchUpdates() ) + logger.warn("The database does not support \"BatchUpdates\"!"); + } catch (Exception e) { + logger.error("Error testing if database supports batch updates", e); } createDatabase(); } - - public HikariDataSource impalaDS() throws SQLException, PropertyVetoException - { - HikariConfig hikariConfig = new HikariConfig(); - hikariConfig.setDriverClassName(ImpalaConnector.impalaDriver); - hikariConfig.setAutoCommit(true); - hikariConfig.setJdbcUrl(ImpalaConnector.impalaConnectionUrl); - hikariConfig.setPoolName(poolName); - hikariConfig.setMaximumPoolSize(hikariMaxConnectionPoolSize); - 
hikariConfig.setMaxLifetime(hikariMaxLifetime); - hikariConfig.setMinimumIdle(hikariMinIdleConnections); - hikariConfig.setConnectionTimeout(hikariConnectionTimeOut); - hikariConfig.setIdleTimeout(hikariIdleTimeOut); - return new HikariDataSource(hikariConfig); - } - - - public void createDatabase() - { - Connection con = getConnection(); - if ( con == null ) - System.exit(22); - - try { - if ( !con.getMetaData().supportsBatchUpdates() ) - logger.warn("The database does not support \"BatchUpdates\"!"); - } catch (SQLException e) { - logger.error(e.getMessage(), e); - } - + private void createDatabase() { logger.info("Going to create the database and the tables, if they do not exist. Also will fill some tables with data from OpenAIRE."); - Statement statement = null; - try { - statement = con.createStatement(); - } catch (SQLException sqle) { - logger.error("Problem when creating a connection-statement!\n" + sqle.getMessage()); - ImpalaConnector.closeConnection(con); - System.exit(33); - } - try { - statement.execute("CREATE DATABASE IF NOT EXISTS " + databaseName); + jdbcTemplate.execute("CREATE DATABASE IF NOT EXISTS " + databaseName); - statement.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".publication stored as parquet as select * from " + oldDatabaseName + ".publication"); - statement.execute("COMPUTE STATS " + databaseName + ".publication"); + jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".publication stored as parquet as select * from " + oldDatabaseName + ".publication"); + jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".publication"); - statement.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".publication_pids stored as parquet as select * from " + oldDatabaseName + ".publication_pids"); - statement.execute("COMPUTE STATS " + databaseName + ".publication_pids"); + jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".publication_pids stored as parquet as select * from " + oldDatabaseName + 
".publication_pids"); + jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".publication_pids"); - statement.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".publication_urls stored as parquet as select * from " + oldDatabaseName + ".publication_urls"); - statement.execute("COMPUTE STATS " + databaseName + ".publication_urls"); + jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".publication_urls stored as parquet as select * from " + oldDatabaseName + ".publication_urls"); + jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".publication_urls"); - statement.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".datasource stored as parquet as select * from " + oldDatabaseName + ".datasource"); - statement.execute("COMPUTE STATS " + databaseName + ".datasource"); + jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".datasource stored as parquet as select * from " + oldDatabaseName + ".datasource"); + jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".datasource"); - statement.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".assignment (id string, original_url string, workerid string, `date` timestamp) stored as parquet"); - statement.execute("COMPUTE STATS " + databaseName + ".assignment"); + jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".assignment (id string, original_url string, workerid string, `date` timestamp) stored as parquet"); + jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".assignment"); - statement.execute("DROP TABLE IF EXISTS " + ImpalaConnector.databaseName + ".current_assignment PURGE"); + jdbcTemplate.execute("DROP TABLE IF EXISTS " + ImpalaConnector.databaseName + ".current_assignment PURGE"); - statement.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".attempt (id string, original_url string, `date` timestamp, status string, error_class string, error_message string) stored as parquet"); - statement.execute("COMPUTE STATS " + databaseName 
+ ".attempt"); + jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".attempt (id string, original_url string, `date` timestamp, status string, error_class string, error_message string) stored as parquet"); + jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".attempt"); - statement.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".payload (id string, original_url string, actual_url string, `date` timestamp, mimetype string, size string, `hash` string, `location` string, provenance string) stored as parquet"); - statement.execute("COMPUTE STATS " + databaseName + ".payload"); - } catch (SQLException sqle) { - String errorMsg = "Problem when executing the \"create database and create tables queries!\n" + sqle.getMessage() + "\nSQL state: " + sqle.getSQLState() + "\nError code: " + sqle.getErrorCode(); - logger.error(errorMsg, sqle); - System.err.println(errorMsg); - System.exit(44); - } finally { - try { - statement.close(); - con.close(); - } catch (SQLException sqle2) { - logger.error("Could not close the connection with the Impala-database.\n" + sqle2.getMessage()); - } - } + jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".payload (id string, original_url string, actual_url string, `date` timestamp, mimetype string, size string, `hash` string, `location` string, provenance string) stored as parquet"); + jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".payload"); logger.info("The database \"" + databaseName + "\" and its tables were created or validated."); } - - public Connection getConnection() - { - try { - return hikariDataSource.getConnection(); - //return DriverManager.getConnection(impalaConnectionUrl, null, null); // This is for non pooled connections. 
- } catch (SQLException sqle) { - logger.error("Problem when connecting with the Impala-database!\n" + sqle.getMessage()); - return null; - } - } - - - public boolean testDatabaseAccess() - { - logger.info("Going to test Impala access.."); - Connection con = getConnection(); - if ( con == null ) - return false; - - ResultSet res = null; - try { - String tableName = "publication"; - - // show tables - String sql = "show tables '" + tableName + "'"; - logger.debug("Running: " + sql); - res = con.prepareStatement(sql).executeQuery(); - if ( res.next() ) { - logger.debug(res.getString(1)); - } - - // describe table - sql = "describe " + tableName; - logger.debug("Running: " + sql); - res = con.prepareStatement(sql).executeQuery(); - while ( res.next() ) { - logger.debug(res.getString(1) + "\t" + res.getString(2)); - } - - // select * query - sql = "select * from " + tableName + " limit 3;"; - logger.debug("Running: " + sql); - res = con.prepareStatement(sql).executeQuery(); - while ( res.next() ) { - logger.debug(res.getString(1)); - } - - // Get Assignments, only for testing here. - //UrlController urlController = new UrlController(); - //ResponseEntity responseEntity = urlController.getUrls("worker_1", ControllerConstants.ASSIGNMENTS_LIMIT); - //logger.debug(responseEntity.toString()); - - } catch (SQLException sqle) { - logger.error(sqle.getMessage(), sqle); - return false; - } finally { - try { - if ( res != null ) - res.close(); - con.close(); - } catch (SQLException sqle) { - logger.error("Could not close the connection with the Impala-database.\n" + sqle); - } - } - return true; - } - - - public static boolean closeConnection(Connection con) { - try { - if ( con != null ) - con.close(); // It may have already closed and that's fine. 
- return true; - } catch (SQLException sqle) { - logger.error("Could not close the connection with the Impala-database.\n" + sqle.getMessage()); - return false; - } - } - - - public static String handlePreparedStatementException(String queryName, String query, String preparedStatementName, PreparedStatement preparedStatement, Connection con, Exception e) + public static String handlePreparedStatementException(String queryName, String query, Exception e) { String errorMsg = "Problem when creating " + (( ! queryName.startsWith("get")) ? "and executing " : "") + "the prepared statement for \"" + queryName + "\"!\n"; - logger.error(errorMsg + "\n\n" + query + "\n\n" + e.getMessage(), e); - closeConnection(con); + logger.error(errorMsg + "\n\n" + query + "\n\n", e); return errorMsg; } - } diff --git a/src/main/java/eu/openaire/urls_controller/controllers/ImpalaController.java b/src/main/java/eu/openaire/urls_controller/controllers/ImpalaController.java index c405640..ba29794 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/ImpalaController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/ImpalaController.java @@ -1,58 +1,41 @@ package eu.openaire.urls_controller.controllers; -import eu.openaire.urls_controller.configuration.ImpalaConnector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; +import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; -import java.sql.Connection; -import java.sql.ResultSet; -import java.util.ArrayList; import java.util.List; +/** + * This controller will test the connectivity with the database and return statistics! 
+ */ @RestController @RequestMapping("/impala") public class ImpalaController { - - // This controller will test the connectivity with the database and return statistics! - private static final Logger logger = LoggerFactory.getLogger(ImpalaController.class); + @Autowired + private JdbcTemplate jdbcTemplate; @GetMapping("get10PublicationIdsTest") public ResponseEntity get10PublicationIdsTest() { - Connection con = ImpalaConnector.getInstance().getConnection(); - if ( con == null ) - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Problem when connecting with the Impala-database!"); - String query = "SELECT id FROM publication LIMIT 10;"; - try ( ResultSet res = con.prepareStatement(query).executeQuery()) { - if ( !res.first() ) { - String errorMsg = "No results retrieved from the \"getAssignmentsQuery\"!"; - logger.error(errorMsg); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } - - List publications = new ArrayList<>(); - do { - publications.add(res.getString(0)); - } while ( res.next() ); - - return new ResponseEntity(publications.toString(), HttpStatus.OK); + try { + List publications = jdbcTemplate.queryForList(query, String.class); + return new ResponseEntity<>(publications.toString(), HttpStatus.OK); } catch (Exception e) { String errorMsg = "Problem when executing \"getAssignmentsQuery\": " + query; logger.error(errorMsg, e); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } finally { - ImpalaConnector.closeConnection(con); } } - } diff --git a/src/main/java/eu/openaire/urls_controller/controllers/TestController.java b/src/main/java/eu/openaire/urls_controller/controllers/TestController.java new file mode 100644 index 0000000..433931e --- /dev/null +++ b/src/main/java/eu/openaire/urls_controller/controllers/TestController.java @@ -0,0 +1,82 @@ +package eu.openaire.urls_controller.controllers; + +import com.google.common.collect.HashMultimap; +import 
eu.openaire.urls_controller.models.Assignment; +import eu.openaire.urls_controller.models.Datasource; +import eu.openaire.urls_controller.payloads.responces.AssignmentsResponse; +import eu.openaire.urls_controller.util.GenericUtils; +import eu.openaire.urls_controller.util.TestFileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.http.HttpStatus; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.GetMapping; +import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RestController; + +import java.sql.Timestamp; +import java.util.*; +import java.util.concurrent.atomic.AtomicLong; + +@RestController +public class TestController extends GeneralController { + + private static final Logger logger = LoggerFactory.getLogger(TestController.class); + + @Autowired + private TestFileUtils fileUtils; + + @Value("services.pdfaggregation.controller.assignmentLimit") + private int assignmentLimit; + + private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); // Just for the "getTestUrls"-endpoint. + + @GetMapping("test") + public ResponseEntity getTestUrls(@RequestParam String workerId, @RequestParam int workerAssignmentsLimit) { + + logger.info("Worker with id: \"" + workerId + "\", requested " + workerAssignmentsLimit + " test-assignments. The assignments-limit of the controller is: " + this.assignmentLimit); + + List assignments = new ArrayList<>(); + HashMultimap loadedIdUrlPairs; + boolean isFirstRun = true; + boolean assignmentsLimitReached = false; + Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records. + + // Start loading urls. 
+ while ( true ) { + loadedIdUrlPairs = fileUtils.getNextIdUrlPairBatchFromJson(); // Take urls from jsonFile. + + if ( fileUtils.isFinishedLoading(loadedIdUrlPairs.isEmpty(), isFirstRun) ) // Throws RuntimeException which is automatically passed on. + break; + else + isFirstRun = false; + + Set> pairs = loadedIdUrlPairs.entries(); + + for ( Map.Entry pair : pairs ) { + if ( assignments.size() >= workerAssignmentsLimit ) { + assignmentsLimitReached = true; + break; + } + + int randomNum = GenericUtils.getRandomNumber(1, 5); + assignments.add(new Assignment(pair.getKey(), pair.getValue(), new Datasource("ID_" + randomNum, "NAME_" + randomNum), workerId, timestamp)); + }// end pairs-for-loop + + if ( assignmentsLimitReached ) { + logger.debug("Done loading urls from the inputFile as the assignmentsLimit (" + workerAssignmentsLimit + ") was reached."); + break; + } + }// end loading-while-loop + + Scanner scanner = fileUtils.inputScanner.get(); + if ( scanner != null ) // Check if the initial value is null. 
+ scanner.close(); + + long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet(); + logger.info("Sending batch_" + curAssignmentsBatchCounter + " with " + assignments.size() + " assignments (" + fileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId); + return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments)); + } +} diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index 59dff72..78a894b 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -1,23 +1,25 @@ package eu.openaire.urls_controller.controllers; -import com.google.common.collect.HashMultimap; import eu.openaire.urls_controller.configuration.ImpalaConnector; import eu.openaire.urls_controller.models.Error; import eu.openaire.urls_controller.models.*; import eu.openaire.urls_controller.payloads.requests.WorkerReport; import eu.openaire.urls_controller.payloads.responces.AssignmentsResponse; -import eu.openaire.urls_controller.util.ControllerConstants; import eu.openaire.urls_controller.util.FileUtils; -import eu.openaire.urls_controller.util.GenericUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.jdbc.core.RowCallbackHandler; import org.springframework.web.bind.annotation.*; import javax.servlet.http.HttpServletRequest; import java.sql.*; -import java.util.*; +import java.util.ArrayList; +import 
java.util.List; import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; @@ -27,11 +29,20 @@ public class UrlController { private static final Logger logger = LoggerFactory.getLogger(UrlController.class); + @Autowired + private JdbcTemplate jdbcTemplate; + + @Autowired + private FileUtils fileUtils; + private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); // Just for the "getTestUrls"-endpoint. - private static final Pattern MALICIOUS_INPUT_STRING = Pattern.compile(".*[';`\"]+.*"); - private static int maxAttemptsPerRecord = ControllerConstants.MAX_ATTEMPTS_PER_RECORD; + @Value("services.pdfaggregation.controller.maxAttemptsPerRecord") + private int maxAttemptsPerRecord; + + @Value("services.pdfaggregation.controller.assignmentLimit") + private int assignmentLimit; @GetMapping("") public ResponseEntity getUrls(@RequestParam String workerId, @RequestParam int workerAssignmentsLimit) { @@ -43,7 +54,7 @@ public class UrlController { return ResponseEntity.status(HttpStatus.FORBIDDEN).body(errorMsg); } - logger.info("Worker with id: \"" + workerId + "\", requested " + workerAssignmentsLimit + " assignments. The assignments-limit of the controller is: " + ControllerConstants.ASSIGNMENTS_LIMIT); + logger.info("Worker with id: \"" + workerId + "\", requested " + workerAssignmentsLimit + " assignments. The assignments-limit of the controller is: " + assignmentLimit); // Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >. @@ -53,9 +64,9 @@ public class UrlController { String errorMsg = "The given \"workerAssignmentsLimit\" was ZERO!"; logger.error(errorMsg); return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg); - } else if ( assignmentsLimit > ControllerConstants.ASSIGNMENTS_LIMIT ) { - logger.warn("The given \"workerAssignmentsLimit\" (" + workerAssignmentsLimit + ") was larger than the Controller's limit (" + ControllerConstants.ASSIGNMENTS_LIMIT + "). 
Will use the Controller's limit."); - assignmentsLimit = ControllerConstants.ASSIGNMENTS_LIMIT; + } else if ( assignmentsLimit > assignmentLimit ) { + logger.warn("The given \"workerAssignmentsLimit\" (" + workerAssignmentsLimit + ") was larger than the Controller's limit (" + assignmentLimit + "). Will use the Controller's limit."); + assignmentsLimit = assignmentLimit; } String findAssignmentsQuery = "select pubid, url, datasourceid, datasourcetype\n" + @@ -80,129 +91,74 @@ public class UrlController { String computeCurrentAssignmentsStatsQuery = "COMPUTE STATS " + ImpalaConnector.databaseName + ".current_assignment"; String getAssignmentsQuery = "select * from " + ImpalaConnector.databaseName + ".current_assignment"; - List assignments = new ArrayList<>(assignmentsLimit); + List assignments = new ArrayList<>(); ImpalaConnector.databaseLock.lock(); - Connection con = ImpalaConnector.getInstance().getConnection(); - if ( con == null ) { // This is already logged in "getConnection()". + + try { + jdbcTemplate.execute(createAssignmentsQuery); + } catch (Exception sqle) { ImpalaConnector.databaseLock.unlock(); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Problem when connecting with the Impala-database!"); } - // All transactions in Impala automatically commit at the end of the statement. Currently, Impala does not support multi-statement transactions. - // https://impala.apache.org/docs/build/html/topics/impala_transactions.html - // We cannot use "savePoints" along with "autoCommit = false" to roll back to a previous state among multiple statements. - - PreparedStatement createCurrentAssignmentsPreparedStatement = null; try { - createCurrentAssignmentsPreparedStatement = con.prepareStatement(createAssignmentsQuery); - // We cannot set the "limits" and the MAX_ATTEMPTS_PER_RECORD as preparedStatements parameters, as we get a "java.sql.SQLException: [Simba][JDBC](11420) Error, parameter metadata not populated." 
- createCurrentAssignmentsPreparedStatement.execute(); - } catch (SQLException sqle) { - ImpalaConnector.databaseLock.unlock(); - String errorMsg = ImpalaConnector.handlePreparedStatementException("createAssignmentsQuery", createAssignmentsQuery, "createCurrentAssignmentsPreparedStatement", createCurrentAssignmentsPreparedStatement, con, sqle); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } finally { - try { - if ( createCurrentAssignmentsPreparedStatement != null ) - createCurrentAssignmentsPreparedStatement.close(); - } catch (SQLException sqle2) { - logger.error("Failed to close the \"createCurrentAssignmentsPreparedStatement\"!\n" + sqle2.getMessage()); - } - } + jdbcTemplate.execute(computeCurrentAssignmentsStatsQuery); + } catch (Exception sqle) { + String errorMsg = dropCurrentAssignmentTable(); - PreparedStatement computeCurrentAssignmentsStatsPreparedStatement = null; - try { - computeCurrentAssignmentsStatsPreparedStatement = con.prepareStatement(computeCurrentAssignmentsStatsQuery); - computeCurrentAssignmentsStatsPreparedStatement.execute(); - } catch (SQLException sqle) { - String errorMsg = dropCurrentAssignmentTable(con); if ( errorMsg != null ) // The "databaseLock" is already unlocked. 
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - ImpalaConnector.databaseLock.unlock(); - errorMsg = ImpalaConnector.handlePreparedStatementException("computeCurrentAssignmentsStatsQuery", computeCurrentAssignmentsStatsQuery, "computeCurrentAssignmentsStatsPreparedStatement", computeCurrentAssignmentsStatsPreparedStatement, con, sqle); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } finally { - try { - if ( computeCurrentAssignmentsStatsPreparedStatement != null ) - computeCurrentAssignmentsStatsPreparedStatement.close(); - } catch (SQLException sqle2) { - logger.error("Failed to close the \"computeCurrentAssignmentsStatsPreparedStatement\"!\n" + sqle2.getMessage()); - } - } - PreparedStatement getAssignmentsPreparedStatement = null; - try { - getAssignmentsPreparedStatement = con.prepareStatement(getAssignmentsQuery); - } catch (SQLException sqle) { - String errorMsg = dropCurrentAssignmentTable(con); - if ( errorMsg != null ) // The "databaseLock" is already unlocked. - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); ImpalaConnector.databaseLock.unlock(); - errorMsg = ImpalaConnector.handlePreparedStatementException("getAssignmentsQuery", getAssignmentsQuery, "getAssignmentsPreparedStatement", getAssignmentsPreparedStatement, con, sqle); - // The "getAssignmentsPreparedStatement" will always be null here, so we do not close it. + errorMsg = ImpalaConnector.handlePreparedStatementException("computeCurrentAssignmentsStatsQuery", computeCurrentAssignmentsStatsQuery, sqle); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); } Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records. 
- try ( ResultSet resultSet = getAssignmentsPreparedStatement.executeQuery() ) { - // Unfortunately, we cannot use the following as the used version of the Impala-driver does not support it. - /*if ( !resultSet.first() ) { - ImpalaConnector.databaseLock.unlock(); - String errorMsg = "No results retrieved from the \"getAssignmentsQuery\" for worker with id: " + workerId; - logger.error(errorMsg); - ImpalaConnector.closeConnection(con); - return ResponseEntity.status(HttpStatus.NO_CONTENT).body(errorMsg); - }*/ - - // The cursor is automatically before the first element in this configuration. - while ( resultSet.next() ) { // Move the cursor forward. - // If the resultsSet is empty, then the control will never get inside the loop. - // The following few lines, cannot be outside the "while" loop, since the same object is added, despite that we update the inner-values. - Assignment assignment = new Assignment(); - assignment.setWorkerId(workerId); - assignment.setTimestamp(timestamp); - Datasource datasource = new Datasource(); - try { // For each of the 4 columns returned. The indexing starts from 1 - assignment.setId(resultSet.getString(1)); - assignment.setOriginalUrl(resultSet.getString(2)); - datasource.setId(resultSet.getString(3)); - datasource.setName(resultSet.getString(4)); - } catch (SQLException sqle) { - logger.error("No value was able to be retrieved from one of the columns of row_" + resultSet.getRow(), sqle); - continue; // This object is broken, move to the next row. + try { + jdbcTemplate.query(getAssignmentsQuery, new RowCallbackHandler() { + @Override + public void processRow(ResultSet rs) throws SQLException { + Assignment assignment = new Assignment(); + assignment.setWorkerId(workerId); + assignment.setTimestamp(timestamp); + Datasource datasource = new Datasource(); + try { // For each of the 4 columns returned. 
The indexing starts from 1 + assignment.setId(rs.getString(1)); + assignment.setOriginalUrl(rs.getString(2)); + datasource.setId(rs.getString(3)); + datasource.setName(rs.getString(4)); + } catch (SQLException sqle) { + logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle); + } + assignment.setDatasource(datasource); + assignments.add(assignment); } - assignment.setDatasource(datasource); - assignments.add(assignment); - } + }); } catch (Exception e) { - String errorMsg = dropCurrentAssignmentTable(con); + String errorMsg = dropCurrentAssignmentTable(); if ( errorMsg != null ) // The "databaseLock" is already unlocked. return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); + ImpalaConnector.databaseLock.unlock(); + errorMsg = "Problem when executing the \"getAssignmentsQuery\"!\n"; logger.error(errorMsg, e); - ImpalaConnector.closeConnection(con); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } finally { - try { - getAssignmentsPreparedStatement.close(); - } catch (SQLException sqle) { - logger.error("Failed to close the \"getAssignmentsPreparedStatement\"!\n" + sqle.getMessage()); - } } int assignmentsSize = assignments.size(); if ( assignmentsSize == 0 ) { - String errorMsg = dropCurrentAssignmentTable(con); + String errorMsg = dropCurrentAssignmentTable(); if ( errorMsg != null ) // The "databaseLock" is already unlocked. return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); ImpalaConnector.databaseLock.unlock(); maxAttemptsPerRecord += 2; // Increase the max-attempts to try again some very old records, in the next requests. errorMsg = "No results retrieved from the \"findAssignmentsQuery\" for worker with id: " + workerId + ". 
Will increase the \"maxAttempts\" to " + maxAttemptsPerRecord + " for the next requests."; logger.error(errorMsg); - ImpalaConnector.closeConnection(con); + return ResponseEntity.status(HttpStatus.NO_CONTENT).body(errorMsg); } else if ( assignmentsSize < assignmentsLimit ) { maxAttemptsPerRecord += 2; // Increase the max-attempts to try again some very old records, in the next requests. @@ -218,48 +174,37 @@ public class UrlController { String insertAssignmentsQuery = "insert into " + ImpalaConnector.databaseName + ".assignment \n select pub_data.pubid, pub_data.url, '" + workerId + "', cast('" + timestamp + "' as timestamp)\n" + "from (\n select pubid, url from " + ImpalaConnector.databaseName + ".current_assignment) as pub_data"; - PreparedStatement insertAssignmentsPreparedStatement = null; try { - insertAssignmentsPreparedStatement = con.prepareStatement(insertAssignmentsQuery); - insertAssignmentsPreparedStatement.execute(); - } catch (SQLException sqle) { - String errorMsg = dropCurrentAssignmentTable(con); + jdbcTemplate.execute(insertAssignmentsQuery); + } catch (Exception sqle) { + String errorMsg = dropCurrentAssignmentTable(); if ( errorMsg != null ) // The "databaseLock" is already unlocked. 
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); ImpalaConnector.databaseLock.unlock(); - errorMsg = ImpalaConnector.handlePreparedStatementException("insertAssignmentsQuery", insertAssignmentsQuery, "insertAssignmentsPreparedStatement", insertAssignmentsPreparedStatement, con, sqle); + errorMsg = ImpalaConnector.handlePreparedStatementException("insertAssignmentsQuery", insertAssignmentsQuery, sqle); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } finally { - try { - if ( insertAssignmentsPreparedStatement != null ) - insertAssignmentsPreparedStatement.close(); - } catch (SQLException sqle2) { - logger.error("Failed to close the \"insertAssignmentsPreparedStatement\"!\n" + sqle2.getMessage()); - } } - String errorMsg = dropCurrentAssignmentTable(con); + String errorMsg = dropCurrentAssignmentTable(); if ( errorMsg != null ) // The "databaseLock" is already unlocked. return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); logger.debug("Finished inserting " + assignmentsSize + " assignments into the \"assignment\"-table. 
Going to merge the parquet files for this table."); - String mergeErrorMsg = FileUtils.mergeParquetFiles("assignment", con, "", null); + String mergeErrorMsg = fileUtils.mergeParquetFiles("assignment", "", null); if ( mergeErrorMsg != null ) { ImpalaConnector.databaseLock.unlock(); - ImpalaConnector.closeConnection(con); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } ImpalaConnector.databaseLock.unlock(); - ImpalaConnector.closeConnection(con); long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet(); logger.info("Sending batch-assignments_" + curAssignmentsBatchCounter + " with " + assignmentsSize + " assignments to worker with ID: " + workerId + "."); return ResponseEntity.status(HttpStatus.OK).body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments)); } - @PostMapping("addWorkerReport") public ResponseEntity addWorkerReport(@RequestBody WorkerReport workerReport, HttpServletRequest request) { @@ -294,52 +239,22 @@ public class UrlController { logger.info("Received the WorkerReport for batch-assignments_" + curReportAssignments + ", from the worker with id: " + curWorkerId + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database."); // Before continuing with inserts, take and upload the fullTexts from the Worker. Also, update the file-"location". 
- FileUtils.UploadFullTextsResponse uploadFullTextsResponse = FileUtils.getAndUploadFullTexts(urlReports, request, curReportAssignments, curWorkerId); + FileUtils.UploadFullTextsResponse uploadFullTextsResponse = fileUtils.getAndUploadFullTexts(urlReports, request, curReportAssignments, curWorkerId); if ( uploadFullTextsResponse == FileUtils.UploadFullTextsResponse.databaseError ) { return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Problem with the Impala-database!"); } else if ( uploadFullTextsResponse == FileUtils.UploadFullTextsResponse.unsuccessful ) { logger.error("Failed to get and/or upload the fullTexts for assignments_" + curReportAssignments); // The docUrls were still found! Just update ALL the fileLocations. sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads. - FileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports); + fileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports); } ImpalaConnector.databaseLock.lock(); - Connection con = ImpalaConnector.getInstance().getConnection(); - if ( con == null ) { - ImpalaConnector.databaseLock.unlock(); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Problem when connecting with the Impala-database!"); - } // Store the workerReport into the database. 
String insertIntoPayloadBaseQuery = "INSERT INTO " + ImpalaConnector.databaseName + ".payload (id, original_url, actual_url, date, mimetype, size, hash, location, provenance) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"; String insertIntoAttemptBaseQuery = "INSERT INTO " + ImpalaConnector.databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)"; - String tempInsertQueryName = null; - PreparedStatement preparedInsertPayloadStatement = null, preparedInsertAttemptStatement = null; - try { - tempInsertQueryName = "insertIntoPayloadBaseQuery"; - preparedInsertPayloadStatement = con.prepareStatement(insertIntoPayloadBaseQuery); - tempInsertQueryName = "insertIntoAttemptBaseQuery"; - preparedInsertAttemptStatement = con.prepareStatement(insertIntoAttemptBaseQuery); - } catch (SQLException sqle) { - ImpalaConnector.databaseLock.unlock(); - String errorMsg = "Problem when creating the prepared statement for \"" + tempInsertQueryName + "\"!\n"; - logger.error(errorMsg + sqle.getMessage()); - closeStatements(preparedInsertPayloadStatement, preparedInsertAttemptStatement, con); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } - - try { - con.setAutoCommit(false); // Avoid writing to disk for each insert. Write them all in the end. 
- } catch (SQLException sqle) { - ImpalaConnector.databaseLock.unlock(); - String errorMsg = "Problem when setting Connection.AutoCommit to \"false\"!\n"; - logger.error(errorMsg + sqle.getMessage()); - closeStatements(preparedInsertPayloadStatement, preparedInsertAttemptStatement, con); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } - String payloadErrorMsg = null; int failedCount = 0; @@ -349,32 +264,27 @@ public class UrlController { for ( UrlReport urlReport : urlReports ) { Payload payload = urlReport.getPayload(); + if ( payload == null ) { - logger.error("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments); + logger.warn("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments); payloadErrorMsg = (++failedCount) + " urlReports failed to be processed because they had no payload!"; + continue; } try { // We use a "PreparedStatement" to do insertions, for security and valid SQL syntax reasons. - preparedInsertPayloadStatement.setString(1, payload.getId()); - preparedInsertPayloadStatement.setString(2, payload.getOriginal_url()); - preparedInsertPayloadStatement.setString(3, payload.getActual_url()); - preparedInsertPayloadStatement.setTimestamp(4, payload.getTimestamp_acquired()); - preparedInsertPayloadStatement.setString(5, payload.getMime_type()); + Object[] args = new Object[] { + payload.getId(), payload.getOriginal_url(), payload.getActual_url(), payload.getTimestamp_acquired(), + payload.getMime_type(), payload.getSize() != null?String.valueOf(payload.getSize()):null, payload.getHash(), + payload.getLocation(), payload.getProvenance()}; + int[] argTypes = new int[] { + Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, + Types.VARCHAR, Types.VARCHAR, Types.VARCHAR}; - // The column "size" in the table is of type "String" so we cast the Long to String. The Parquet-format in the database does not work well with integers. 
- String sizeStr = null; - Long size = payload.getSize(); - if ( size != null ) - sizeStr = String.valueOf(size); + jdbcTemplate.update(insertIntoPayloadBaseQuery, args, argTypes); - preparedInsertPayloadStatement.setString(6, sizeStr); - preparedInsertPayloadStatement.setString(7, payload.getHash()); - preparedInsertPayloadStatement.setString(8, payload.getLocation()); - preparedInsertPayloadStatement.setString(9, payload.getProvenance()); - preparedInsertPayloadStatement.executeUpdate(); - } catch (SQLException sqle) { - logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": " + sqle.getMessage() + "\n\n"); + } catch (Exception sqle) { + logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": ", sqle); } Error error = urlReport.getError(); @@ -384,81 +294,58 @@ public class UrlController { } try { // We use a "PreparedStatement" to do insertions, for security and valid SQL syntax reasons. - preparedInsertAttemptStatement.setString(1, payload.getId()); - preparedInsertAttemptStatement.setString(2, payload.getOriginal_url()); - preparedInsertAttemptStatement.setTimestamp(3, payload.getTimestamp_acquired()); - preparedInsertAttemptStatement.setString(4, urlReport.getStatus().toString()); - preparedInsertAttemptStatement.setString(5, String.valueOf(error.getType())); // This covers the case of "null" too. 
- preparedInsertAttemptStatement.setString(6, error.getMessage()); - preparedInsertAttemptStatement.executeUpdate(); - } catch (SQLException sqle) { - logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage() + "\n\n"); + Object[] args = new Object[] { + payload.getId(), payload.getOriginal_url(), payload.getTimestamp_acquired(), + urlReport.getStatus().toString(), String.valueOf(error.getType()), error.getMessage()}; + int[] argTypes = new int[] { + Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, + Types.VARCHAR}; + + jdbcTemplate.update(insertIntoAttemptBaseQuery, args, argTypes); + } catch (Exception sqle) { + logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": ", sqle); } }//end for-loop - try { - con.commit(); // Commit all the insert-queries to the database (write them to disk). - } catch (SQLException sqle) { - ImpalaConnector.databaseLock.unlock(); - ImpalaConnector.closeConnection(con); - String errorMsg = "Problem when committing changes to the database or when setting Connection.AutoCommit to \"true\"!"; - logger.error(errorMsg + "\n" + sqle.getMessage()); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } finally { - closeStatements(preparedInsertPayloadStatement, preparedInsertAttemptStatement, null); // Do not close the connection here, as we might move forward. - } if ( payloadErrorMsg != null ) logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables, although " + payloadErrorMsg + " Going to merge the parquet files for those tables."); else logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables.
Going to merge the parquet files for those tables."); - String mergeErrorMsg = FileUtils.mergeParquetFiles("payload", con, "", null); + String mergeErrorMsg = fileUtils.mergeParquetFiles("payload", "", null); if ( mergeErrorMsg != null ) { ImpalaConnector.databaseLock.unlock(); - ImpalaConnector.closeConnection(con); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } - mergeErrorMsg = FileUtils.mergeParquetFiles("attempt", con, "", null); + mergeErrorMsg = fileUtils.mergeParquetFiles("attempt", "", null); if ( mergeErrorMsg != null ) { ImpalaConnector.databaseLock.unlock(); - ImpalaConnector.closeConnection(con); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } // This will delete the rows of the "assignment" table which refer to the curWorkerId. As we have non-kudu Impala tables, the Delete operation can only succeed through a "merge" operation of the rest of the data. // Only the rows referring to OTHER workerIDs get stored in a temp-table, while the "assignment" table gets deleted. Then, the temp_table becomes the "assignment" table. // We do not need to keep the assignment-info anymore, the "findAssignmentsQuery" checks the payload table for previously handled tasks. - mergeErrorMsg = FileUtils.mergeParquetFiles("assignment", con, " WHERE workerid != ", curWorkerId); + mergeErrorMsg = fileUtils.mergeParquetFiles("assignment", " WHERE workerid != ", curWorkerId); if ( mergeErrorMsg != null ) { ImpalaConnector.databaseLock.unlock(); - ImpalaConnector.closeConnection(con); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } - try { - con.commit(); // Apply the merges permanently (write them to disk). - con.setAutoCommit(true); // Restore the "auto-commit" value for this connection of the pool. 
- } catch (SQLException sqle) { - String errorMsg = "Problem when committing changes to the database!"; - logger.error(errorMsg + "\n" + sqle.getMessage()); - // The statements used in "mergeParquetFiles()" are already closed. - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - } finally { - ImpalaConnector.databaseLock.unlock(); - ImpalaConnector.closeConnection(con); - } + ImpalaConnector.databaseLock.unlock(); logger.debug("Finished merging the database tables."); return ResponseEntity.status(HttpStatus.OK).body(payloadErrorMsg); } - // The "batchExecute" does not work in this Impala-Database, so this is a "giant-query" solution. // Note: this causes an "Out of memory"-ERROR in the current version of the Impala JDBC driver. If a later version is provided, then this code should be tested. - private static PreparedStatement constructLargeInsertQuery(Connection con, String baseInsertQuery, int dataSize, int numParamsPerRow) throws RuntimeException - { + private static PreparedStatement constructLargeInsertQuery(Connection con, String baseInsertQuery, int dataSize, int numParamsPerRow) throws RuntimeException { StringBuilder sb = new StringBuilder(baseInsertQuery.length() + (dataSize * 6 * numParamsPerRow)); // TODO - Make this a global Thread-Local var. And then "clear" (reset) it after each use. sb.append(baseInsertQuery); for ( int i=1; i <= dataSize; ++i ) { @@ -484,98 +371,15 @@ public class UrlController { return preparedInsertStatement; } - - private boolean closeStatements(Statement statement1, Statement statement2, Connection con) { - try { - if ( statement1 != null ) - statement1.close(); - if ( statement2 != null ) - statement2.close(); - if ( con != null ) - con.close(); // It may have already closed and that's fine. 
- return true; - } catch (SQLException sqle) { - logger.error("Could not close the statements or the connection with the Impala-database.\n" + sqle.getMessage()); - return false; - } - } - - - @GetMapping("test") - public ResponseEntity getTestUrls(@RequestParam String workerId, @RequestParam int workerAssignmentsLimit) { - - logger.info("Worker with id: \"" + workerId + "\", requested " + workerAssignmentsLimit + " test-assignments. The assignments-limit of the controller is: " + ControllerConstants.ASSIGNMENTS_LIMIT); - - try { - new FileUtils(); // Find the input file. - } catch (Exception e) { - logger.error(e.getMessage()); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The resource file, for the requested assignments, was not found."); - } - - List assignments = new ArrayList<>(); - HashMultimap loadedIdUrlPairs; - boolean isFirstRun = true; - boolean assignmentsLimitReached = false; - Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records. - - // Start loading urls. - while ( true ) { - loadedIdUrlPairs = FileUtils.getNextIdUrlPairBatchFromJson(); // Take urls from jsonFile. - - if ( FileUtils.isFinishedLoading(loadedIdUrlPairs.isEmpty(), isFirstRun) ) // Throws RuntimeException which is automatically passed on. 
- break; - else - isFirstRun = false; - - Set> pairs = loadedIdUrlPairs.entries(); - - for ( Map.Entry pair : pairs ) - { - if ( assignments.size() >= workerAssignmentsLimit ) { - assignmentsLimitReached = true; - break; - } - - int randomNum = GenericUtils.getRandomNumber(1, 5); - assignments.add(new Assignment(pair.getKey(), pair.getValue(), new Datasource("ID_" + randomNum, "NAME_" + randomNum), workerId, timestamp)); - }// end pairs-for-loop - - if ( assignmentsLimitReached ) { - logger.debug("Done loading urls from the inputFile as the assignmentsLimit (" + workerAssignmentsLimit + ") was reached."); - break; - } - }// end loading-while-loop - - Scanner scanner = FileUtils.inputScanner.get(); - if ( scanner != null ) // Check if the initial value is null. - scanner.close(); - - long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet(); - logger.info("Sending batch_" + curAssignmentsBatchCounter + " with " + assignments.size() + " assignments (" + FileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId); - return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments)); - } - - - private String dropCurrentAssignmentTable(Connection con) - { + private String dropCurrentAssignmentTable() { String dropCurrentAssignmentsQuery = "DROP TABLE " + ImpalaConnector.databaseName + ".current_assignment PURGE"; - PreparedStatement dropCurrentAssignmentsPreparedStatement = null; + try { - dropCurrentAssignmentsPreparedStatement = con.prepareStatement(dropCurrentAssignmentsQuery); - dropCurrentAssignmentsPreparedStatement.execute(); + jdbcTemplate.execute(dropCurrentAssignmentsQuery); return null; - } catch (SQLException sqle) { + } catch (Exception sqle) { ImpalaConnector.databaseLock.unlock(); - return ImpalaConnector.handlePreparedStatementException("dropCurrentAssignmentsQuery", 
dropCurrentAssignmentsQuery, "dropCurrentAssignmentsPreparedStatement", dropCurrentAssignmentsPreparedStatement, con, sqle); - } finally { - try { - if ( dropCurrentAssignmentsPreparedStatement != null ) - dropCurrentAssignmentsPreparedStatement.close(); - } catch (SQLException sqle2) { - logger.error("Failed to close the \"dropCurrentAssignmentsPreparedStatement\"!\n" + sqle2.getMessage()); - } + return ImpalaConnector.handlePreparedStatementException("dropCurrentAssignmentsQuery", dropCurrentAssignmentsQuery, sqle); } } - } diff --git a/src/main/java/eu/openaire/urls_controller/util/ControllerConstants.java b/src/main/java/eu/openaire/urls_controller/util/ControllerConstants.java deleted file mode 100644 index ffe86c8..0000000 --- a/src/main/java/eu/openaire/urls_controller/util/ControllerConstants.java +++ /dev/null @@ -1,10 +0,0 @@ -package eu.openaire.urls_controller.util; - - -public interface ControllerConstants { - - int ASSIGNMENTS_LIMIT = 100_000; // The upper assignments-limit the Controller can handle. If the worker's limit is above this one, then the controller's limit is used. Otherwise, the worker's limit will be applied. - - int MAX_ATTEMPTS_PER_RECORD = 3; // The maximum times a record can be processed, if each of the previous times failed with a "couldRetry" Error-Class. 
- -} diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUnZipper.java b/src/main/java/eu/openaire/urls_controller/util/FileUnZipper.java index ed88cda..287ce6a 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUnZipper.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUnZipper.java @@ -2,6 +2,7 @@ package eu.openaire.urls_controller.util; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.springframework.stereotype.Component; import java.io.File; import java.io.FileInputStream; @@ -12,20 +13,16 @@ import java.nio.file.StandardCopyOption; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; - +@Component public class FileUnZipper { private static final Logger logger = LoggerFactory.getLogger(FileUnZipper.class); - - public static void unzipFolder(Path source, Path target) throws Exception - { - try ( ZipInputStream zis = new ZipInputStream(new FileInputStream(source.toFile())) ) - { + public void unzipFolder(Path source, Path target) throws Exception { + try ( ZipInputStream zis = new ZipInputStream(new FileInputStream(source.toFile())) ) { // Iterate over the files in zip and un-zip them. ZipEntry zipEntry = zis.getNextEntry(); - while ( zipEntry != null ) - { + while ( zipEntry != null ) { Path targetPath = zipSlipProtect(zipEntry, target); if ( zipEntry.getName().endsWith(File.separator) ) // If we have a directory. @@ -44,10 +41,8 @@ public class FileUnZipper { } } - // Protect from a Zip Slip attack: https://snyk.io/research/zip-slip-vulnerability - public static Path zipSlipProtect(ZipEntry zipEntry, Path targetDir) throws IOException - { + public Path zipSlipProtect(ZipEntry zipEntry, Path targetDir) throws IOException { Path targetDirResolved = targetDir.resolve(zipEntry.getName()); // Make sure normalized file still has targetDir as its prefix, else throw an exception. 
Path normalizePath = targetDirResolved.normalize(); @@ -56,5 +51,4 @@ public class FileUnZipper { } return normalizePath; } - -} +} \ No newline at end of file diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 01f66d8..8320524 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -3,12 +3,15 @@ package eu.openaire.urls_controller.util; import com.google.common.collect.HashMultimap; import eu.openaire.urls_controller.configuration.ImpalaConnector; import eu.openaire.urls_controller.models.Payload; -import eu.openaire.urls_controller.models.Task; import eu.openaire.urls_controller.models.UrlReport; +import org.codehaus.groovy.syntax.Types; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.boot.configurationprocessor.json.JSONException; -import org.springframework.boot.configurationprocessor.json.JSONObject; +import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.dao.DataAccessException; +import org.springframework.jdbc.core.JdbcTemplate; +import org.springframework.stereotype.Component; import javax.servlet.http.HttpServletRequest; import java.io.*; @@ -17,51 +20,38 @@ import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.sql.*; -import java.util.*; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; +@Component public class FileUtils { private static final Logger logger = LoggerFactory.getLogger(FileUtils.class); - public static ThreadLocal inputScanner = new ThreadLocal(); // Every Thread has its own variable. 
- private static final ThreadLocal fileIndex = new ThreadLocal(); - private static final ThreadLocal unretrievableInputLines = new ThreadLocal(); - public static ThreadLocal duplicateIdUrlEntries = new ThreadLocal(); - public static final int jsonBatchSize = 3000; - private static final String utf8Charset = "UTF-8"; - public static String inputFileFullPath; - private static final String workingDir = System.getProperty("user.dir") + File.separator; + @Autowired + private JdbcTemplate jdbcTemplate; + @Autowired + private S3ObjectStore s3ObjectStore; - public FileUtils() throws RuntimeException - { - inputFileFullPath = workingDir + "src" + File.separator + "main" + File.separator + "resources"; - String resourceFileName = "testInputFiles" + File.separator + "orderedList1000.json"; - inputFileFullPath += File.separator + resourceFileName; - InputStream inputStream = getClass().getClassLoader().getResourceAsStream(resourceFileName); - if ( inputStream == null ) - throw new RuntimeException("No resourceFile was found with name \"" + resourceFileName + "\"."); + @Autowired + private FileUnZipper fileUnZipper; - logger.debug("Going to retrieve the data from the inputResourceFile: " + resourceFileName); + public enum UploadFullTextsResponse {successful, unsuccessful, databaseError} - FileUtils.inputScanner.set(new Scanner(inputStream, utf8Charset)); - fileIndex.set(0); // Re-initialize the file-number-pointer. - unretrievableInputLines.set(0); - duplicateIdUrlEntries.set(0); + public FileUtils() throws RuntimeException { } - /** * In each insertion, a new parquet-file is created, so we end up with millions of files. Parquet is great for fast-select, so have to stick with it and merge those files.. * This method, creates a clone of the original table in order to have only one parquet file in the end. Drops the original table. * Renames the clone to the original's name. * Returns the errorMsg, if an error appears, otherwise is returns "null". 
* */ - public static String mergeParquetFiles(String tableName, Connection con, String whereClause, String parameter) - { + public String mergeParquetFiles(String tableName, String whereClause, String parameter) { String errorMsg; if ( tableName == null ) { errorMsg = "No tableName was given. Do not know the tableName for which we should merger the underlying files for!"; @@ -78,42 +68,29 @@ public class FileUtils { else parameter = " '" + parameter + "'"; // This will be a "string-check". - Statement statement; try { - statement = con.createStatement(); - } catch (SQLException sqle) { - errorMsg = "Problem when creating a connection-statement!\n"; - logger.error(errorMsg + sqle.getMessage()); - return errorMsg; - } - - try { - statement.execute("CREATE TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + ImpalaConnector.databaseName + "." + tableName + " " + whereClause + parameter); - statement.execute("DROP TABLE " + ImpalaConnector.databaseName + "." + tableName + " PURGE"); - statement.execute("ALTER TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp RENAME TO " + ImpalaConnector.databaseName + "." + tableName); - statement.execute("COMPUTE STATS " + ImpalaConnector.databaseName + "." + tableName); - } catch (SQLException sqle) { + jdbcTemplate.execute("CREATE TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + ImpalaConnector.databaseName + "." + tableName + " " + whereClause + parameter); + jdbcTemplate.execute("DROP TABLE " + ImpalaConnector.databaseName + "." + tableName + " PURGE"); + jdbcTemplate.execute("ALTER TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp RENAME TO " + ImpalaConnector.databaseName + "." + tableName); + jdbcTemplate.execute("COMPUTE STATS " + ImpalaConnector.databaseName + "." 
+ tableName); + } catch (DataAccessException e) { errorMsg = "Problem when executing the \"clone-drop-rename\" queries!\n"; - logger.error(errorMsg + getCutBatchExceptionMessage(sqle.getMessage()), sqle); + logger.error(errorMsg, e); return errorMsg; - } finally { - // Make sure we close the statement. - try { statement.close(); } - catch (SQLException sqle3) { logger.error("Could not close the statement for executing queries in the Impala-database.\n" + sqle3); } } return null; // No errorMsg, everything is fine. } + private final Pattern FILENAME_ID = Pattern.compile("([\\w_:]+)\\.[\\w]{2,10}$"); + private final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:]+\\.[\\w]{2,10})$"); - public enum UploadFullTextsResponse {successful, unsuccessful, databaseError}; - private static final Pattern FILENAME_ID = Pattern.compile("([\\w_:]+)\\.[\\w]{2,10}$"); - private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:]+\\.[\\w]{2,10})$"); - public static final String baseTargetLocation = System.getProperty("user.dir") + File.separator + "fullTexts" + File.separator; - private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames). + @Value("${services.pdfaggregation.controller.baseTargetLocation}") + private String baseTargetLocation; - public static UploadFullTextsResponse getAndUploadFullTexts(List urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) - { + private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames). + + public UploadFullTextsResponse getAndUploadFullTexts(List urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) { // The Controller have to request the files from the Worker, in order to upload them to the S3. // We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database.
@@ -126,30 +103,15 @@ public class FileUtils { remoteAddr = request.getRemoteAddr(); ImpalaConnector.databaseLock.lock(); - Connection con = ImpalaConnector.getInstance().getConnection(); - if ( con == null ) { - ImpalaConnector.databaseLock.unlock(); - logger.error("Problem when creating the Impala-connection!"); - return UploadFullTextsResponse.databaseError; - } String getFileLocationForHashQuery = "select `location` from " + ImpalaConnector.databaseName + ".payload where `hash` = ?" ; - PreparedStatement getFileLocationForHashPreparedStatement = null; - try { - getFileLocationForHashPreparedStatement = con.prepareStatement(getFileLocationForHashQuery); - } catch (SQLException sqle) { - ImpalaConnector.databaseLock.unlock(); - logger.error("Problem when creating the prepared statement for \"" + getFileLocationForHashQuery + "\"!\n" + sqle.getMessage()); - return UploadFullTextsResponse.databaseError; - } // Get the file-locations. int numFullTextUrlsFound = 0; int numFilesFoundFromPreviousAssignmentsBatches = 0; HashMultimap allFileNamesWithIDsHashMap = HashMultimap.create((urlReports.size() / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it. - for ( UrlReport urlReport : urlReports ) - { + for ( UrlReport urlReport : urlReports ) { UrlReport.StatusType statusType = urlReport.getStatus(); if ( (statusType == null) || statusType.equals(UrlReport.StatusType.non_accessible) ) { continue; @@ -160,7 +122,7 @@ public class FileUtils { if ( payload == null ) continue; - String fileLocation = null; + String fileLocation; // Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH. // If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker. 
@@ -168,32 +130,24 @@ public class FileUtils { // Use the same prepared-statement for all requests, to improve speed (just like when inserting similar thing to the DB). String fileHash = payload.getHash(); if ( fileHash != null ) { - try { - getFileLocationForHashPreparedStatement.setString(1, fileHash); - } catch (SQLException sqle) { - logger.error("Error when setting the parameter in \"getFileLocationForHashQuery\"!\n" + sqle.getMessage()); + + fileLocation = jdbcTemplate.queryForObject(getFileLocationForHashQuery, new Object[] {fileHash}, new int[] {Types.STRING}, String.class); + + if ( fileLocation != null ) { // If the full-text of this record is already-found and uploaded. + payload.setLocation(fileLocation); // Set the location to the older identical file, which was uploaded to S3. The other file-data is identical. + + //logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + fileLocation + "\"."); // DEBUG! + numFilesFoundFromPreviousAssignmentsBatches ++; + + continue; // Do not request the file from the worker, it's already uploaded. Move on. } - try ( ResultSet resultSet = getFileLocationForHashPreparedStatement.executeQuery() ) { - if ( resultSet.next() ) { // Move the "cursor" to the first row. If there is any data, then take the first result (there should not be more, but we still want the first anyway). - fileLocation = resultSet.getString(1); - if ( fileLocation != null ) { // If the full-text of this record is already-found and uploaded. - payload.setLocation(fileLocation); // Set the location to the older identical file, which was uploaded to S3. The other file-data is identical. - //logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + fileLocation + "\"."); // DEBUG! 
- numFilesFoundFromPreviousAssignmentsBatches ++; - continue; // Do not request the file from the worker, it's already uploaded. Move on. - } - } - } catch (Exception e) { - logger.error("Error when executing or acquiring data from the the \"getFileLocationForHashQuery\"!\n" + e.getMessage()); - - // TODO - SHOULD WE RETURN A "UploadFullTextsResponse.databaseError" AND force the caller to not even insert the payloads to the database?? - // TODO - Since the database will have problems.. there is no point in trying to insert the payloads to Impala (we will handle it like: we tried to insert and got an error). - // TODO - In case we DO return, UNLOCK the database-lock and close the Prepared statement (it's not auto-closed here)and the Database connection. - } + // TODO - SHOULD WE RETURN A "UploadFullTextsResponse.databaseError" AND force the caller to not even insert the payloads to the database?? + // TODO - Since the database will have problems.. there is no point in trying to insert the payloads to Impala (we will handle it like: we tried to insert and got an error). + // TODO - In case we DO return, UNLOCK the database-lock and close the Prepared statement (it's not auto-closed here)and the Database connection. } - // If the full-text of this record was not found by a previous batch.. + // If the full-text of this record was not found by a previous batch... fileLocation = payload.getLocation(); if ( fileLocation != null ) { // If the docFile was downloaded (without an error).. Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation); @@ -209,16 +163,7 @@ public class FileUtils { } } - // Close the Prepared Statement. 
- try { - if ( getFileLocationForHashPreparedStatement != null ) - getFileLocationForHashPreparedStatement.close(); - } catch (SQLException sqle) { - logger.error("Failed to close the \"getFileLocationForHashPreparedStatement\"!\n" + sqle.getMessage()); - } finally { - ImpalaConnector.databaseLock.unlock(); // The rest work of this function does not use the database. - ImpalaConnector.closeConnection(con); - } + ImpalaConnector.databaseLock.unlock(); // The rest work of this function does not use the database. logger.info("NumFullTextUrlsFound by assignments_" + assignmentsBatchCounter + " = " + numFullTextUrlsFound + " (out of " + urlReports.size() + ")."); logger.debug("NumFilesFoundFromPreviousAssignmentsBatches = " + numFilesFoundFromPreviousAssignmentsBatches); @@ -252,8 +197,7 @@ public class FileUtils { File curAssignmentsBaseDir = new File(curAssignmentsBaseLocation); int failedBatches = 0; - for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter ) - { + for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter ) { List fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, batchCounter); HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, batchCounter, fileNamesForCurBatch, numOfBatches, workerId); if ( conn == null ) { @@ -280,7 +224,7 @@ public class FileUtils { //logger.debug("The zip file has been saved: " + zipFileFullPath); // DEBUG! - FileUnZipper.unzipFolder(Paths.get(zipFileFullPath), curBatchPath); + fileUnZipper.unzipFolder(Paths.get(zipFileFullPath), curBatchPath); String[] fileNames = new File(targetDirectory).list(); if ( (fileNames == null) || (fileNames.length <= 1 ) ) { // The directory might have only one file, the "zip-file". @@ -311,7 +255,7 @@ public class FileUtils { // At this point, we know that this file is related with one or more IDs of the payloads AND it has a valid fileName. 
// Let's try to upload the file to S3 and update the payloads of all related IDs, either in successful upload or not. - String s3Url = S3ObjectStoreMinIO.uploadToS3(fileName, fileFullPath); + String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath); if ( s3Url != null ) { setFullTextForMultipleIDs(fileRelatedIDs, payloadsHashMultimap, s3Url); // It checks weather (s3Url != null) and acts accordingly. numUploadedFiles ++; @@ -342,9 +286,7 @@ public class FileUtils { } } - - private static HttpURLConnection getConnection(String baseUrl, long assignmentsBatchCounter, int batchNum, List fileNamesForCurBatch, int totalBatches, String workerId) - { + private HttpURLConnection getConnection(String baseUrl, long assignmentsBatchCounter, int batchNum, List fileNamesForCurBatch, int totalBatches, String workerId) { baseUrl += batchNum + "/"; String requestUrl = getRequestUrlForBatch(baseUrl, fileNamesForCurBatch); logger.info("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]"); @@ -366,9 +308,7 @@ public class FileUtils { return conn; } - - private static String getErrorMessageFromResponseBody(HttpURLConnection conn) - { + private String getErrorMessageFromResponseBody(HttpURLConnection conn) { StringBuilder errorMsgStrB = new StringBuilder(500); try ( BufferedReader br = new BufferedReader(new InputStreamReader(conn.getErrorStream())) ) { // Try-with-resources String inputLine; @@ -387,9 +327,7 @@ public class FileUtils { } } - - private static List getFileNamesForBatch(List allFileNames, int numAllFullTexts, int curBatch) - { + private List getFileNamesForBatch(List allFileNames, int numAllFullTexts, int curBatch) { int initialIndex = ((curBatch-1) * numOfFullTextsPerBatch); int endingIndex = (curBatch * numOfFullTextsPerBatch); if ( endingIndex > 
numAllFullTexts ) // This might be the case, when the "numAllFullTexts" is too small. @@ -406,12 +344,9 @@ public class FileUtils { return fileNamesOfCurBatch; } + private String getRequestUrlForBatch(String baseUrl, List fileNamesForCurBatch) { + final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 50); - private static final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 50); - // TODO - Make it THREAD-LOCAL, if we move to multi-thread batch requests. - - private static String getRequestUrlForBatch(String baseUrl, List fileNamesForCurBatch) - { sb.append(baseUrl); int numFullTextsCurBatch = fileNamesForCurBatch.size(); for ( int j=0; j < numFullTextsCurBatch; ++j ){ @@ -420,14 +355,13 @@ public class FileUtils { sb.append(","); } String requestUrl = sb.toString(); - sb.setLength(0); // Reset for the next batch. + return requestUrl; } + private final int bufferSize = 20971520; // 20 MB - private static final int bufferSize = 20971520; // 20 MB - public static boolean saveZipFile(HttpURLConnection conn, File zipFile) - { + public boolean saveZipFile(HttpURLConnection conn, File zipFile) { InputStream inStream = null; FileOutputStream outStream = null; try { @@ -454,9 +388,7 @@ public class FileUtils { } } - - private static boolean isFileNameProblematic(String fileName, HashMultimap payloadsHashMultimap) - { + private boolean isFileNameProblematic(String fileName, HashMultimap payloadsHashMultimap) { // Get the ID of the file. Matcher matcher = FILENAME_ID.matcher(fileName); if ( !matcher.matches() ) { @@ -492,16 +424,13 @@ public class FileUtils { return true; } - /** * This method updates the UrlReports to not point to any downloaded fullText files. * This is useful when the uploading process of the fullTexts to the S3-ObjectStore fails. * Then, we don't want any "links" to locally stored files, which will be deleted. 
* @param urlReports - * @return */ - public static void updateUrlReportsToHaveNoFullTextFiles(List urlReports) - { + public void updateUrlReportsToHaveNoFullTextFiles(List urlReports) { for ( UrlReport urlReport : urlReports ) { Payload payload = urlReport.getPayload(); if ( payload != null ) @@ -509,22 +438,18 @@ public class FileUtils { } } - - private static void replaceNotUploadedFileLocations(List urlReports) - { + private void replaceNotUploadedFileLocations(List urlReports) { for ( UrlReport urlReport : urlReports ) { Payload payload = urlReport.getPayload(); if ( payload != null ) { String fileLocation = payload.getLocation(); - if ( (fileLocation != null) && (! fileLocation.startsWith(S3ObjectStoreMinIO.endpoint)) ) + if ( (fileLocation != null) && (! s3ObjectStore.locationInStore(fileLocation)) ) setUnretrievedFullText(payload); } } } - - public static void updateUrlReportsForCurBatchTOHaveNoFullTextFiles(HashMultimap payloadsHashMultimap, List fileNames) - { + public void updateUrlReportsForCurBatchTOHaveNoFullTextFiles(HashMultimap payloadsHashMultimap, List fileNames) { for ( String fileName : fileNames ) { // Get the ID of the file. Matcher matcher = FILENAME_ID.matcher(fileName); @@ -543,9 +468,7 @@ public class FileUtils { } } - - public static void setUnretrievedFullText(Payload payload) - { + public void setUnretrievedFullText(Payload payload) { // Mark the full-text as not-retrieved, since it will be deleted from local-storage. The retrieved link to the full-text will be kept. payload.setLocation(null); payload.setHash(null); @@ -553,15 +476,13 @@ public class FileUtils { payload.setSize(null); } - /** * Set the fileLocation for all those IDs related to the File. The IDs may have one or more payloads. 
* @param fileIDs * @param payloadsHashMultimap * @param s3Url */ - public static void setFullTextForMultipleIDs(Set fileIDs, HashMultimap payloadsHashMultimap, String s3Url) - { + public void setFullTextForMultipleIDs(Set fileIDs, HashMultimap payloadsHashMultimap, String s3Url) { for ( String id : fileIDs ) { Set payloads = payloadsHashMultimap.get(id); if ( payloads.isEmpty() ) { @@ -575,8 +496,7 @@ public class FileUtils { } } - - public static boolean deleteDirectory(File curBatchDir) { + public boolean deleteDirectory(File curBatchDir) { try { org.apache.commons.io.FileUtils.deleteDirectory(curBatchDir); return true; @@ -585,136 +505,4 @@ public class FileUtils { return false; } } - - - private static String getCutBatchExceptionMessage(String sqleMessage) - { - // The sqleMessage contains the actual message followed by the long batch. This makes the logs unreadable. So we should shorten the message before logging. - int maxEnding = 1500; - if ( sqleMessage.length() > maxEnding ) - return (sqleMessage.substring(0, maxEnding) + "..."); - else - return sqleMessage; - } - - - // This is currently not used, but it may be useful in a future scenario. - private static long getInputFileLinesNum() - { - long numOfLines = 0; - try { - numOfLines = Files.lines(Paths.get(inputFileFullPath)).count(); - logger.debug("The numOfLines in the inputFile is " + numOfLines); - } catch (IOException e) { - logger.error("Could not retrieve the numOfLines. " + e); - return -1; - } - return numOfLines; - } - - - /** - * This method decodes a Json String and returns its members. - * @param jsonLine String - * @return HashMap - */ - public static Task jsonDecoder(String jsonLine) - { - // Get ID and url and put them in the HashMap - String idStr = null; - String urlStr = null; - try { - JSONObject jObj = new JSONObject(jsonLine); // Construct a JSONObject from the retrieved jsonLine. 
- idStr = jObj.get("id").toString(); - urlStr = jObj.get("url").toString(); - } catch (JSONException je) { - logger.warn("JSONException caught when tried to parse and extract values from jsonLine: \t" + jsonLine, je); - return null; - } - - if ( urlStr.isEmpty() ) { - if ( !idStr.isEmpty() ) // If we only have the id, then go and log it. - logger.warn("The url was not found for id: \"" + idStr + "\""); - return null; - } - return new Task(idStr, urlStr, null); - } - - - /** - * This method parses a Json file and extracts the urls, along with the IDs. - * @return HashMultimap - */ - public static HashMultimap getNextIdUrlPairBatchFromJson() - { - Task inputIdUrlTuple; - int expectedPathsPerID = 5; - int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID; - - HashMultimap idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID); - - int curBeginning = fileIndex.get(); - - while ( inputScanner.get().hasNextLine() && (fileIndex.get() < (curBeginning + jsonBatchSize)) ) - {// While (!EOF) and inside the current url-batch, iterate through lines. - - //logger.debug("fileIndex: " + FileUtils.fileIndex.get()); // DEBUG! - - // Take each line, remove potential double quotes. - String retrievedLineStr = inputScanner.get().nextLine(); - //logger.debug("Loaded from inputFile: " + retrievedLineStr); // DEBUG! - - fileIndex.set(fileIndex.get() +1); - - if ( retrievedLineStr.isEmpty() ) { - unretrievableInputLines.set(unretrievableInputLines.get() +1); - continue; - } - - if ( (inputIdUrlTuple = jsonDecoder(retrievedLineStr)) == null ) { // Decode the jsonLine and take the two attributes. - logger.warn("A problematic inputLine found: \t" + retrievedLineStr); - unretrievableInputLines.set(unretrievableInputLines.get() +1); - continue; - } - - if ( !idAndUrlMappedInput.put(inputIdUrlTuple.getId(), inputIdUrlTuple.getUrl()) ) { // We have a duplicate url in the input.. log it here as we cannot pass it through the HashMultimap. 
It's possible that this as well as the original might be/give a docUrl. - duplicateIdUrlEntries.set(duplicateIdUrlEntries.get() +1); - } - } - - return idAndUrlMappedInput; - } - - - /** - * This method returns the number of (non-heading, non-empty) lines we have read from the inputFile. - * @return loadedUrls - */ - public static int getCurrentlyLoadedUrls() // In the end, it gives the total number of urls we have processed. - { - return FileUtils.fileIndex.get() - FileUtils.unretrievableInputLines.get(); - } - - - /** - * This method checks if there is no more input-data and returns true in that case. - * Otherwise, it returns false, if there is more input-data to be loaded. - * A "RuntimeException" is thrown if no input-urls were retrieved in general. - * @param isEmptyOfData - * @param isFirstRun - * @return finished loading / not finished - * @throws RuntimeException - */ - public static boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun) - { - if ( isEmptyOfData ) { - if ( isFirstRun ) - logger.error("Could not retrieve any urls from the inputFile!"); - else - logger.debug("Done loading " + FileUtils.getCurrentlyLoadedUrls() + " urls from the inputFile."); - return true; - } - return false; - } - -} +} \ No newline at end of file diff --git a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java new file mode 100644 index 0000000..c62c41d --- /dev/null +++ b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java @@ -0,0 +1,141 @@ +package eu.openaire.urls_controller.util; + +import io.minio.*; +import io.minio.messages.Bucket; +import io.minio.messages.Item; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.stereotype.Component; + +import javax.annotation.PostConstruct; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; 
+ +@Component +public class S3ObjectStore { + + private static final Logger logger = LoggerFactory.getLogger(S3ObjectStore.class); + + @Value("services.pdfaggregation.controller.s3.endpoint") + private String endpoint = null; // This is useful to be "public", to test file-locations. + @Value("services.pdfaggregation.controller.s3.accessKey") + private String accessKey = null; + @Value("services.pdfaggregation.controller.s3.secretKey") + private String secretKey = null; + @Value("services.pdfaggregation.controller.s3.region") + private String region = null; + @Value("services.pdfaggregation.controller.s3.bucketName") + private String bucketName = null; + + @Value("services.pdfaggregation.controller.s3.shouldEmptyBucket") + private boolean shouldEmptyBucket = false; // Set true only for testing! + @Value("services.pdfaggregation.controller.s3.shouldShowAllS3Buckets") + private boolean shouldShowAllS3Buckets = false; + + private MinioClient minioClient; + + @PostConstruct + public void init() throws Exception { + this.minioClient = MinioClient.builder().endpoint(endpoint).credentials(accessKey, secretKey).region(region).build(); + + boolean bucketExists = minioClient.bucketExists(BucketExistsArgs.builder().bucket(bucketName).build()); + + // Keep this commented-out to avoid objects-deletion by accident. The code is open-sourced, so it's easy to enable this ability if we really want it (e.g. for testing). + if ( bucketExists && shouldEmptyBucket ) { + emptyBucket(bucketName, false); + //throw new RuntimeException("stop just for test!"); + } + + // Make the bucket, if not exist. + if ( !bucketExists ) { + logger.info("Bucket \"" + bucketName + "\" does not exist! 
Going to create it.."); + minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build()); + } else + logger.debug("Bucket \"" + bucketName + "\" already exists."); + + if ( shouldShowAllS3Buckets ) { + List buckets = null; + try { + buckets = minioClient.listBuckets(); + logger.debug("The buckets in the S3 ObjectStore are:"); + for ( Bucket bucket : buckets ) { + logger.debug(bucket.name()); + } + } catch (Exception e) { + logger.warn("Could not listBuckets: " + e.getMessage()); + } + } + } + + private final Pattern EXTENSION_PATTERN = Pattern.compile("(\\.[^.]+)$"); + + /** + * @param fileObjKeyName = "**File object key name**"; + * @param fileFullPath = "**Path of the file to upload**"; + * @return the url of the uploaded file + */ + public String uploadToS3(String fileObjKeyName, String fileFullPath) throws Exception { + String contentType = null; + + // Take the Matcher to retrieve the extension. + Matcher extensionMatcher = EXTENSION_PATTERN.matcher(fileFullPath); + if ( extensionMatcher.find() ) { + String extension = null; + if ( (extension = extensionMatcher.group(0)) == null ) + contentType = "application/pdf"; + else { + if ( extension.equals("pdf") ) + contentType = "application/pdf"; + /*else if ( *//* TODO - other-extension-match *//* ) + contentType = "application/pdf"; */ + else + contentType = "application/pdf"; + } + } else { + logger.warn("The file with key \"" + fileObjKeyName + "\" does not have a file-extension! Setting the \"pdf\"-mimeType."); + contentType = "application/pdf"; + } + + minioClient.uploadObject(UploadObjectArgs.builder() + .bucket(bucketName) + .object(fileObjKeyName).filename(fileFullPath) + .contentType(contentType).build()); + + // TODO - What if the fileObjKeyName already exists? + // Right now it gets overwritten (unless we add versioning, which is irrelevant for different objects..) 
+ + String s3Url = endpoint + "/" + bucketName + "/" + fileObjKeyName; // Be aware: This url works only if the access to the bucket is public. + //logger.debug("Uploaded file \"" + fileObjKeyName + "\". The s3Url is: " + s3Url); + return s3Url; + } + + public void emptyBucket(String bucketName, boolean shouldDeleteBucket) throws Exception { + logger.warn("Going to " + (shouldDeleteBucket ? "delete" : "empty") + " bucket \"" + bucketName + "\""); + + // First list the objects of the bucket. + Iterable> results = minioClient.listObjects(ListObjectsArgs.builder().bucket(bucketName).build()); + + // Then, delete the objects. + for ( Result resultItem : results ) + try { + deleteFile(resultItem.get().objectName(), bucketName); + } catch (Exception e) { + logger.warn("Could not remove " + resultItem.get().objectName()); + } + + if ( shouldDeleteBucket ) { + // Lastly, delete the empty bucket. + minioClient.removeBucket(RemoveBucketArgs.builder().bucket(bucketName).build()); + } + } + + public boolean locationInStore(String location) { + return location.startsWith(endpoint); + } + + private void deleteFile(String fileObjKeyName, String bucketName) throws Exception { + minioClient.removeObject(RemoveObjectArgs.builder().bucket(bucketName).object(fileObjKeyName).build()); + } +} diff --git a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStoreMinIO.java b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStoreMinIO.java deleted file mode 100644 index 642e803..0000000 --- a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStoreMinIO.java +++ /dev/null @@ -1,226 +0,0 @@ -package eu.openaire.urls_controller.util; - -import io.minio.*; -import io.minio.messages.Bucket; -import io.minio.messages.Item; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.util.List; -import java.util.Scanner; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - - -public class S3ObjectStoreMinIO { - - private static final 
Logger logger = LoggerFactory.getLogger(S3ObjectStoreMinIO.class); - - public static String endpoint = null; // This is useful to be "public", to test file-locations. - private static String accessKey = null; - private static String secretKey = null; - private static String region = null; - private static String bucketName = null; - - private static MinioClient minioClient; - - public static final boolean shouldEmptyBucket = false; // Set true only for testing! - public static final String credentialsFilePath = System.getProperty("user.dir") + File.separator + "S3_minIO_credentials.txt"; - private static final boolean shouldShowAllS3Buckets = false; - - - /** - * This must be called before any other methods. - * */ - public S3ObjectStoreMinIO() - { - // Take the credentials from the file. - Scanner myReader = null; - try { - File credentialsFile = new File(credentialsFilePath); - if ( !credentialsFile.exists() ) { - throw new RuntimeException("credentialsFile \"" + credentialsFilePath + "\" does not exists!"); - } - myReader = new Scanner(credentialsFile); - if ( myReader.hasNextLine() ) { - String[] credentials = myReader.nextLine().split(","); - if ( credentials.length < 5 ) { - throw new RuntimeException("Not all credentials were retrieved from file \"" + credentialsFilePath + "\"!"); - } - endpoint = credentials[0].trim(); - accessKey = credentials[1].trim(); - secretKey = credentials[2].trim(); - region = credentials[3].trim(); - bucketName = credentials[4].trim(); - } - } catch (Exception e) { - String errorMsg = "An error prevented the retrieval of the minIO credentials from the file: " + credentialsFilePath + "\n" + e.getMessage(); - logger.error(errorMsg, e); - System.err.println(errorMsg); - System.exit(53); - } finally { - if ( myReader != null ) - myReader.close(); - } - - if ( (endpoint == null) || (accessKey == null) || (secretKey == null) || (region == null) || (bucketName == null) ) { - String errorMsg = "No \"endpoint\" or/and \"accessKey\" or/and 
\"secretKey\" or/and \"region\" or/and \"bucketName\" could be retrieved from the file: " + credentialsFilePath; - logger.error(errorMsg); - System.err.println(errorMsg); - System.exit(54); - } - // It's not safe, nor helpful to show the credentials in the logs. - - minioClient = MinioClient.builder().endpoint(endpoint).credentials(accessKey, secretKey).region(region).build(); - - boolean bucketExists = false; - try { - bucketExists = minioClient.bucketExists(BucketExistsArgs.builder().bucket(bucketName).build()); - } catch (Exception e) { - String errorMsg = "There was a problem while checking if the bucket \"" + bucketName + "\" exists!\n" + e.getMessage(); - logger.error(errorMsg); - System.err.println(errorMsg); - System.exit(55); - } - - // Keep this commented-out to avoid objects-deletion by accident. The code is open-sourced, so it's easy to enable this ability if we really want it (e.g. for testing). -/* if ( bucketExists && shouldEmptyBucket ) { - emptyBucket(bucketName, false); - //throw new RuntimeException("stop just for test!"); - }*/ - - // Make the bucket, if not exist. - try { - if ( !bucketExists ) { - logger.info("Bucket \"" + bucketName + "\" does not exist! 
Going to create it.."); - minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build()); - } - else - logger.debug("Bucket \"" + bucketName + "\" already exists."); - } catch (Exception e) { - String errorMsg = "Could not create the bucket \"" + bucketName + "\"!"; - logger.error(errorMsg ,e); - System.err.println(errorMsg); - System.exit(56); - } - - if ( shouldShowAllS3Buckets ) { - List buckets = null; - try { - buckets = minioClient.listBuckets(); - logger.debug("The buckets in the S3 ObjectStore are:"); - for ( Bucket bucket : buckets ) { - logger.debug(bucket.name()); - } - } catch (Exception e) { - logger.warn("Could not listBuckets: " + e.getMessage()); - } - } - } - - - public static final Pattern EXTENSION_PATTERN = Pattern.compile("(\\.[^.]+)$"); - - /** - * @param fileObjKeyName = "**File object key name**"; - * @param fileFullPath = "**Path of the file to upload**"; - * @return - */ - public static String uploadToS3(String fileObjKeyName, String fileFullPath) - { - String contentType = null; - - // Take the Matcher to retrieve the extension. - Matcher extensionMatcher = EXTENSION_PATTERN.matcher(fileFullPath); - if ( extensionMatcher.find() ) { - String extension = null; - if ( (extension = extensionMatcher.group(0)) == null ) - contentType = "application/pdf"; - else { - if ( extension.equals("pdf") ) - contentType = "application/pdf"; - /*else if ( *//* TODO - other-extension-match *//* ) - contentType = "application/pdf"; */ - else - contentType = "application/pdf"; - } - } else { - logger.warn("The file with key \"" + fileObjKeyName + "\" does not have a file-extension! Setting the \"pdf\"-mimeType."); - contentType = "application/pdf"; - } - - ObjectWriteResponse response; - try { - response = minioClient.uploadObject(UploadObjectArgs.builder() - .bucket(bucketName) - .object(fileObjKeyName).filename(fileFullPath) - .contentType(contentType).build()); - - // TODO - What if the fileObjKeyName already exists? 
- // Right now it gets overwritten (unless we add versioning, which is irrelevant for different objects..) - - } catch (Exception e) { - logger.error("Could not upload the file \"" + fileObjKeyName + "\" to the S3 ObjectStore, exception: " + e.getMessage(), e); - return null; - } - - String s3Url = endpoint + "/" + bucketName + "/" + fileObjKeyName; // Be aware: This url works only if the access to the bucket is public. - //logger.debug("Uploaded file \"" + fileObjKeyName + "\". The s3Url is: " + s3Url); - return s3Url; - } - - - public static boolean emptyBucket(String bucketName, boolean shouldDeleteBucket) - { - logger.warn("Going to " + (shouldDeleteBucket ? "delete" : "empty") + " bucket \"" + bucketName + "\""); - - // First list the objects of the bucket. - Iterable> results; - try { - results = minioClient.listObjects(ListObjectsArgs.builder().bucket(bucketName).build()); - } catch (Exception e) { - logger.error("Could not retrieve the list of objects of bucket \"" + bucketName + "\"!"); - return false; - } - - // Then, delete the objects. - for ( Result resultItem : results ) { - try { - if ( !deleteFile(resultItem.get().objectName(), bucketName) ) { - logger.error("Cannot proceed with bucket deletion, since only an empty bucket can be removed!"); - return false; - } - } catch (Exception e) { - logger.error("Error getting the object from resultItem: " + resultItem.toString() + "\nThe bucket \"" + bucketName + "\" will not be able to be deleted! Exception message: " + e.getMessage()); - return false; - } - } - - if ( shouldDeleteBucket ) { - // Lastly, delete the empty bucket. 
- try { - minioClient.removeBucket(RemoveBucketArgs.builder().bucket(bucketName).build()); - } catch (Exception e) { - logger.error("Could not delete the bucket \"" + bucketName + "\" from the S3 ObjectStore, exception: " + e.getMessage(), e); - return false; - } - } - - return true; - } - - - public static boolean deleteFile(String fileObjKeyName, String bucketName) - { - try { - minioClient.removeObject(RemoveObjectArgs.builder().bucket(bucketName).object(fileObjKeyName).build()); - } catch (Exception e) { - logger.error("Could not delete the file \"" + fileObjKeyName + "\" from the S3 ObjectStore, exception: " + e.getMessage(), e); - return false; - } - return true; - } - - -} diff --git a/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java b/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java new file mode 100644 index 0000000..8645ed3 --- /dev/null +++ b/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java @@ -0,0 +1,147 @@ +package eu.openaire.urls_controller.util; + +import com.google.common.collect.HashMultimap; +import eu.openaire.urls_controller.models.Task; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.configurationprocessor.json.JSONException; +import org.springframework.boot.configurationprocessor.json.JSONObject; +import org.springframework.core.io.Resource; +import org.springframework.stereotype.Component; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Scanner; + +@Component +public class TestFileUtils { + + private static final Logger logger = LoggerFactory.getLogger(TestFileUtils.class); + + @Value("classpath:testInputFiles/orderedList1000.json") + Resource testResource; + + public ThreadLocal duplicateIdUrlEntries = new ThreadLocal<>(); + public ThreadLocal inputScanner = new ThreadLocal<>(); // Every Thread has its own variable. 
+ + private final int jsonBatchSize = 3000; + private final ThreadLocal fileIndex = new ThreadLocal<>(); + private final ThreadLocal unretrievableInputLines = new ThreadLocal<>(); + + private final String utf8Charset = "UTF-8"; + + public TestFileUtils() throws IOException { + String resourceFileName = "testInputFiles/orderedList1000.json"; + + InputStream inputStream = testResource.getInputStream(); + if ( inputStream == null ) + throw new RuntimeException("No resourceFile was found with name \"" + resourceFileName + "\"."); + + logger.debug("Going to retrieve the data from the inputResourceFile: " + resourceFileName); + + inputScanner.set(new Scanner(inputStream, utf8Charset)); + + fileIndex.set(0); + unretrievableInputLines.set(0); + duplicateIdUrlEntries.set(0); + } + + /** + * This method parses a Json file and extracts the urls, along with the IDs. + * @return HashMultimap + */ + public HashMultimap getNextIdUrlPairBatchFromJson() { + Task inputIdUrlTuple; + int expectedPathsPerID = 5; + int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID; + + HashMultimap idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID); + + int curBeginning = fileIndex.get(); + + while ( inputScanner.get().hasNextLine() && (fileIndex.get() < (curBeginning + jsonBatchSize)) ) + {// While (!EOF) and inside the current url-batch, iterate through lines. + + //logger.debug("fileIndex: " + FileUtils.fileIndex.get()); // DEBUG! + + // Take each line, remove potential double quotes. + String retrievedLineStr = inputScanner.get().nextLine(); + //logger.debug("Loaded from inputFile: " + retrievedLineStr); // DEBUG! + + fileIndex.set(fileIndex.get() +1); + + if ( retrievedLineStr.isEmpty() ) { + unretrievableInputLines.set(unretrievableInputLines.get() +1); + continue; + } + + if ( (inputIdUrlTuple = jsonDecoder(retrievedLineStr)) == null ) { // Decode the jsonLine and take the two attributes. 
+ logger.warn("A problematic inputLine found: \t" + retrievedLineStr); + unretrievableInputLines.set(unretrievableInputLines.get() +1); + continue; + } + + if ( !idAndUrlMappedInput.put(inputIdUrlTuple.getId(), inputIdUrlTuple.getUrl()) ) { // We have a duplicate url in the input.. log it here as we cannot pass it through the HashMultimap. It's possible that this as well as the original might be/give a docUrl. + duplicateIdUrlEntries.set(duplicateIdUrlEntries.get() +1); + } + } + + return idAndUrlMappedInput; + } + + /** + * This method decodes a Json String and returns its members. + * @param jsonLine String + * @return HashMap + */ + private Task jsonDecoder(String jsonLine) { + // Get ID and url and put them in the HashMap + String idStr = null; + String urlStr = null; + try { + JSONObject jObj = new JSONObject(jsonLine); // Construct a JSONObject from the retrieved jsonLine. + idStr = jObj.get("id").toString(); + urlStr = jObj.get("url").toString(); + } catch (JSONException je) { + logger.warn("JSONException caught when tried to parse and extract values from jsonLine: \t" + jsonLine, je); + return null; + } + + if ( urlStr.isEmpty() ) { + if ( !idStr.isEmpty() ) // If we only have the id, then go and log it. + logger.warn("The url was not found for id: \"" + idStr + "\""); + return null; + } + return new Task(idStr, urlStr, null); + } + + /** + * This method checks if there is no more input-data and returns true in that case. + * Otherwise, it returns false, if there is more input-data to be loaded. + * A "RuntimeException" is thrown if no input-urls were retrieved in general. 
+ * @param isEmptyOfData + * @param isFirstRun + * @return finished loading / not finished + * @throws RuntimeException + */ + public boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun) { + if ( isEmptyOfData ) { + if ( isFirstRun ) + logger.error("Could not retrieve any urls from the inputFile!"); + else + logger.debug("Done loading " + getCurrentlyLoadedUrls() + " urls from the inputFile."); + return true; + } + return false; + } + + /** + * This method returns the number of (non-heading, non-empty) lines we have read from the inputFile. + * @return loadedUrls + */ + private int getCurrentlyLoadedUrls() { // In the end, it gives the total number of urls we have processed. + return fileIndex.get() - unretrievableInputLines.get(); + } +} diff --git a/src/main/java/eu/openaire/urls_controller/util/UriBuilder.java b/src/main/java/eu/openaire/urls_controller/util/UriBuilder.java index d47155a..c1da4a4 100644 --- a/src/main/java/eu/openaire/urls_controller/util/UriBuilder.java +++ b/src/main/java/eu/openaire/urls_controller/util/UriBuilder.java @@ -13,73 +13,72 @@ import java.net.URL; public class UriBuilder { - private static final Logger logger = LoggerFactory.getLogger(UriBuilder.class); - - public static String baseUrl = null; - - public UriBuilder(Environment environment) - { - baseUrl = "http"; - String sslEnabled = environment.getProperty("server.ssl.enabled"); - if (sslEnabled == null) { // It's expected to not exist if there is no SSL-configuration. - logger.warn("No property \"server.ssl.enabled\" was found in \"application.properties\". Continuing with plain HTTP.."); - sslEnabled = "false"; - } - baseUrl += sslEnabled.equals("true") ? "s" : ""; - baseUrl += "://"; - - String hostName = getPublicIP(); - if ( hostName == null ) - hostName = InetAddress.getLoopbackAddress().getHostName(); // Non-null. - - baseUrl += hostName; - String serverPort = environment.getProperty("server.port"); - if (serverPort == null) { // This is unacceptable! 
- logger.error("No property \"server.port\" was found in \"application.properties\"!"); - System.exit(-1); // Well, I guess the Spring Boot would not start in this case anyway. - } - baseUrl += ":" + serverPort; - - String baseInternalPath = environment.getProperty("server.servlet.context-path"); - if ( baseInternalPath != null ) { - if ( !baseInternalPath.startsWith("/") ) - baseUrl += "/"; - baseUrl += baseInternalPath; - if ( !baseInternalPath.endsWith("/") ) - baseUrl += "/"; - } else { - logger.warn("No property \"server.servlet.context-path\" was found in \"application.properties\"!"); // Yes it's expected. - baseUrl += "/"; - } - - logger.debug("ServerBaseURL: " + baseUrl); - } - - private static String getPublicIP() - { - String publicIpAddress = ""; - URL url_name; - try { - url_name = new URL("https://api.ipify.org/"); - } catch (MalformedURLException mue) { - logger.warn(mue.getMessage()); - return null; - } - try (BufferedReader bf = new BufferedReader(new InputStreamReader(url_name.openStream()))) { - publicIpAddress = bf.readLine().trim(); - } catch (Exception e) { - logger.warn("Cannot get the publicIP address for this machine!", e); - return null; - } - return publicIpAddress; - } - - public static String getBaseUrl() { - return baseUrl; - } - - public static void setBaseUrl(String baseUrl) { - UriBuilder.baseUrl = baseUrl; - } +// private static final Logger logger = LoggerFactory.getLogger(UriBuilder.class); +// +// public static String baseUrl = null; +// +// public UriBuilder(Environment environment) { +// baseUrl = "http"; +// String sslEnabled = environment.getProperty("server.ssl.enabled"); +// if (sslEnabled == null) { // It's expected to not exist if there is no SSL-configuration. +// logger.warn("No property \"server.ssl.enabled\" was found in \"application.properties\". Continuing with plain HTTP.."); +// sslEnabled = "false"; +// } +// baseUrl += sslEnabled.equals("true") ? 
"s" : ""; +// baseUrl += "://"; +// +// String hostName = getPublicIP(); +// if ( hostName == null ) +// hostName = InetAddress.getLoopbackAddress().getHostName(); // Non-null. +// +// baseUrl += hostName; +// String serverPort = environment.getProperty("server.port"); +// if (serverPort == null) { // This is unacceptable! +// logger.error("No property \"server.port\" was found in \"application.properties\"!"); +// System.exit(-1); // Well, I guess the Spring Boot would not start in this case anyway. +// } +// baseUrl += ":" + serverPort; +// +// String baseInternalPath = environment.getProperty("server.servlet.context-path"); +// if ( baseInternalPath != null ) { +// if ( !baseInternalPath.startsWith("/") ) +// baseUrl += "/"; +// baseUrl += baseInternalPath; +// if ( !baseInternalPath.endsWith("/") ) +// baseUrl += "/"; +// } else { +// logger.warn("No property \"server.servlet.context-path\" was found in \"application.properties\"!"); // Yes it's expected. +// baseUrl += "/"; +// } +// +// logger.debug("ServerBaseURL: " + baseUrl); +// } +// +// private String getPublicIP() +// { +// String publicIpAddress = ""; +// URL url_name; +// try { +// url_name = new URL("https://api.ipify.org/"); +// } catch (MalformedURLException mue) { +// logger.warn(mue.getMessage()); +// return null; +// } +// try (BufferedReader bf = new BufferedReader(new InputStreamReader(url_name.openStream()))) { +// publicIpAddress = bf.readLine().trim(); +// } catch (Exception e) { +// logger.warn("Cannot get the publicIP address for this machine!", e); +// return null; +// } +// return publicIpAddress; +// } +// +// public static String getBaseUrl() { +// return baseUrl; +// } +// +// public static void setBaseUrl(String baseUrl) { +// UriBuilder.baseUrl = baseUrl; +// } } \ No newline at end of file diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index 04f472c..c5ac15f 100644 --- a/src/main/resources/application.properties +++ 
b/src/main/resources/application.properties @@ -15,13 +15,27 @@ server.port = 1880 # Server api path server.servlet.context-path=/api +#Service config +services.pdfaggregation.controller.db.oldDatabaseName = pdfaggregation_i +services.pdfaggregation.controller.db.databaseName = pdfAggregationDatabase +services.pdfaggregation.controller.baseTargetLocation = /tmp/ +services.pdfaggregation.controller.maxAttemptsPerRecord = 3 +services.pdfaggregation.controller.assignmentLimit = 10000 + +services.pdfaggregation.controller.s3.endpoint = xa +services.pdfaggregation.controller.s3.accessKey = xa +services.pdfaggregation.controller.s3.secretKey = xa +services.pdfaggregation.controller.s3.region = xa +services.pdfaggregation.controller.s3.bucketName = xa +services.pdfaggregation.controller.s3.shouldEmptyBucket = false +services.pdfaggregation.controller.s3.shouldShowAllS3Buckets = true + + # Database - -spring.impala.url = jdbc:impala://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/ -# Note: The "UseNativeQuery" does not work with the PreparedStatements! Also, the url does not work without the ending "/" -# The username and the password do not matter, since this app is always run in an pre-authenticated machine. 
- -spring.impala.driver-class-name = com.cloudera.impala.jdbc41.Driver +spring.datasource.url=jdbc:impala://iis-cdh5-test-gw.ocean.icm.edu.pl:21050/ +spring.datasource.username= +spring.datasource.password= +spring.datasource.driver-class-name=com.cloudera.impala.jdbc41.Driver spring.datasource.hikari.pool-name=ControllerPool spring.datasource.hikari.maximumPoolSize=20 From 3741cce886586c16dd61dab169f684f9b0d2ac64 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Sun, 30 Jan 2022 22:15:13 +0200 Subject: [PATCH 03/11] springified project --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..39d1da9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +.gradle/ +.idea/ +build/ +derby.log +logs/ +src/main/main.iml From 3da6fd98e9a7734b4ea9fe1492553fbff8d96de5 Mon Sep 17 00:00:00 2001 From: antleb Date: Mon, 31 Jan 2022 04:21:31 +0200 Subject: [PATCH 04/11] added Dockerfile --- Dockerfile | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 Dockerfile diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..62db9c5 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM openjdk:8-jdk-alpine + +VOLUME /tmp +VOLUME /mnt/config +VOLUME /mnt/logs + +COPY build/libs/*-SNAPSHOT.jar app.jar + +RUN ln -s /mnt/config/application.properties ./application.properties + +ENTRYPOINT ["java","-jar","/app.jar", "--config.location=application.properties"] From 1c82088a7cc5827373c3ab7fe2f1489169e48eca Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 31 Jan 2022 13:49:14 +0200 Subject: [PATCH 05/11] fixed Value annotations --- .../configuration/ImpalaConnector.java | 4 ++-- .../controllers/TestController.java | 2 +- .../urls_controller/controllers/UrlController.java | 4 ++-- .../urls_controller/util/S3ObjectStore.java | 14 +++++++------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git 
a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java index 6d381c8..1f342eb 100644 --- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java +++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java @@ -19,9 +19,9 @@ public final class ImpalaConnector { @Autowired private JdbcTemplate jdbcTemplate; - @Value("services.pdfaggregation.controller.db.oldDatabaseName:pdfaggregation_i") + @Value("${services.pdfaggregation.controller.db.oldDatabaseName}:pdfaggregation_i") public static String oldDatabaseName; - @Value("services.pdfaggregation.controller.db.databaseName:pdfAggregationDatabase") + @Value("${services.pdfaggregation.controller.db.databaseName}:pdfAggregationDatabase") public static String databaseName; public static final Lock databaseLock = new ReentrantLock(true); // This lock is locking the threads trying to execute queries in the database. diff --git a/src/main/java/eu/openaire/urls_controller/controllers/TestController.java b/src/main/java/eu/openaire/urls_controller/controllers/TestController.java index 433931e..457f121 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/TestController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/TestController.java @@ -28,7 +28,7 @@ public class TestController extends GeneralController { @Autowired private TestFileUtils fileUtils; - @Value("services.pdfaggregation.controller.assignmentLimit") + @Value("${services.pdfaggregation.controller.assignmentLimit}") private int assignmentLimit; private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); // Just for the "getTestUrls"-endpoint. 
diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index 78a894b..af7cf07 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -38,10 +38,10 @@ public class UrlController { private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); // Just for the "getTestUrls"-endpoint. private static final Pattern MALICIOUS_INPUT_STRING = Pattern.compile(".*[';`\"]+.*"); - @Value("services.pdfaggregation.controller.maxAttemptsPerRecord") + @Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") private int maxAttemptsPerRecord; - @Value("services.pdfaggregation.controller.assignmentLimit") + @Value("${services.pdfaggregation.controller.assignmentLimit}") private int assignmentLimit; @GetMapping("") diff --git a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java index 724f944..8ceab10 100644 --- a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java +++ b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java @@ -19,20 +19,20 @@ public class S3ObjectStore { private static final Logger logger = LoggerFactory.getLogger(S3ObjectStore.class); private String s3Protocol = "s3://"; - @Value("services.pdfaggregation.controller.s3.endpoint") + @Value("${services.pdfaggregation.controller.s3.endpoint}") private String endpoint = null; // This is useful to be "public", to test file-locations. 
- @Value("services.pdfaggregation.controller.s3.accessKey") + @Value("${services.pdfaggregation.controller.s3.accessKey}") private String accessKey = null; - @Value("services.pdfaggregation.controller.s3.secretKey") + @Value("${services.pdfaggregation.controller.s3.secretKey}") private String secretKey = null; - @Value("services.pdfaggregation.controller.s3.region") + @Value("${services.pdfaggregation.controller.s3.region}") private String region = null; - @Value("services.pdfaggregation.controller.s3.bucketName") + @Value("${services.pdfaggregation.controller.s3.bucketName}") private String bucketName = null; - @Value("services.pdfaggregation.controller.s3.shouldEmptyBucket") + @Value("${services.pdfaggregation.controller.s3.shouldEmptyBucket}") private boolean shouldEmptyBucket = false; // Set true only for testing! - @Value("services.pdfaggregation.controller.s3.shouldShowAllS3Buckets") + @Value("${services.pdfaggregation.controller.s3.shouldShowAllS3Buckets}") private boolean shouldShowAllS3Buckets = false; private MinioClient minioClient; From 9ac10fc4b3b01fe5cf3d859b36f0be5b2e922a3b Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Mon, 31 Jan 2022 14:01:26 +0200 Subject: [PATCH 06/11] fixed Value annotations --- .../urls_controller/configuration/ImpalaConnector.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java index 1f342eb..6f2b09f 100644 --- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java +++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java @@ -19,9 +19,9 @@ public final class ImpalaConnector { @Autowired private JdbcTemplate jdbcTemplate; - @Value("${services.pdfaggregation.controller.db.oldDatabaseName}:pdfaggregation_i") + @Value("${services.pdfaggregation.controller.db.oldDatabaseName}") public static String 
oldDatabaseName; - @Value("${services.pdfaggregation.controller.db.databaseName}:pdfAggregationDatabase") + @Value("${services.pdfaggregation.controller.db.databaseName}") public static String databaseName; public static final Lock databaseLock = new ReentrantLock(true); // This lock is locking the threads trying to execute queries in the database. From e9bede5c456c2361f77dcbd6c7175131c2ce6224 Mon Sep 17 00:00:00 2001 From: Antonis Lempesis Date: Tue, 1 Feb 2022 02:08:02 +0200 Subject: [PATCH 07/11] more fixes --- .../configuration/ImpalaConnector.java | 13 +++++--- .../controllers/UrlController.java | 33 ++++++++++--------- .../urls_controller/util/FileUtils.java | 14 +++++--- 3 files changed, 35 insertions(+), 25 deletions(-) diff --git a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java index 6f2b09f..b0309e9 100644 --- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java +++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java @@ -19,13 +19,16 @@ public final class ImpalaConnector { @Autowired private JdbcTemplate jdbcTemplate; - @Value("${services.pdfaggregation.controller.db.oldDatabaseName}") - public static String oldDatabaseName; - @Value("${services.pdfaggregation.controller.db.databaseName}") - public static String databaseName; + private final String oldDatabaseName; + private final String databaseName; public static final Lock databaseLock = new ReentrantLock(true); // This lock is locking the threads trying to execute queries in the database. 
+ public ImpalaConnector(@Value("${services.pdfaggregation.controller.db.oldDatabaseName}") String oldDatabaseName, + @Value("${services.pdfaggregation.controller.db.databaseName}") String databaseName) { + this.oldDatabaseName = oldDatabaseName; + this.databaseName = databaseName; + } @PostConstruct public void init() { logger.info("Max available memory to the Controller: " + Runtime.getRuntime().maxMemory() + " bytes."); @@ -60,7 +63,7 @@ public final class ImpalaConnector { jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".assignment (id string, original_url string, workerid string, `date` timestamp) stored as parquet"); jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".assignment"); - jdbcTemplate.execute("DROP TABLE IF EXISTS " + ImpalaConnector.databaseName + ".current_assignment PURGE"); + jdbcTemplate.execute("DROP TABLE IF EXISTS " + databaseName + ".current_assignment PURGE"); jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + databaseName + ".attempt (id string, original_url string, `date` timestamp, status string, error_class string, error_message string) stored as parquet"); jdbcTemplate.execute("COMPUTE STATS " + databaseName + ".attempt"); diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index af7cf07..0b9b3b9 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -44,6 +44,9 @@ public class UrlController { @Value("${services.pdfaggregation.controller.assignmentLimit}") private int assignmentLimit; + @Value("${services.pdfaggregation.controller.db.databaseName}") + private String databaseName; + @GetMapping("") public ResponseEntity getUrls(@RequestParam String workerId, @RequestParam int workerAssignmentsLimit) { @@ -72,24 +75,24 @@ public class UrlController { String findAssignmentsQuery = "select 
pubid, url, datasourceid, datasourcetype\n" + "from (select distinct pubid, url, datasourceid, datasourcetype, attempt_count from (\n" + "select p.id as pubid, pu.url as url, d.id as datasourceid, d.type as datasourcetype, attempts.counts as attempt_count\n" + - "from " + ImpalaConnector.databaseName + ".publication p\n" + - "join " + ImpalaConnector.databaseName + ".publication_urls pu on pu.id=p.id\n" + - "join " + ImpalaConnector.databaseName + ".datasource d on d.id=p.datasourceid\n" + - "left outer join (select count(a.id) as counts, a.id from " + ImpalaConnector.databaseName + ".attempt a group by a.id) as attempts on attempts.id=p.id\n" + + "from " + databaseName + ".publication p\n" + + "join " + databaseName + ".publication_urls pu on pu.id=p.id\n" + + "join " + databaseName + ".datasource d on d.id=p.datasourceid\n" + + "left outer join (select count(a.id) as counts, a.id from " + databaseName + ".attempt a group by a.id) as attempts on attempts.id=p.id\n" + "left outer join (\n" + - " select a.id, a.original_url from " + ImpalaConnector.databaseName + ".assignment a\n" + + " select a.id, a.original_url from " + databaseName + ".assignment a\n" + " union all\n" + - " select pl.id, pl.original_url from " + ImpalaConnector.databaseName + ".payload pl) as existing on existing.id=p.id and existing.original_url=pu.url\n" + - "where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecord + " and not exists (select 1 from " + ImpalaConnector.databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" + + " select pl.id, pl.original_url from " + databaseName + ".payload pl) as existing on existing.id=p.id and existing.original_url=pu.url\n" + + "where d.allow_harvest=true and existing.id is null and coalesce(attempts.counts, 0) <= " + maxAttemptsPerRecord + " and not exists (select 1 from " + databaseName + ".attempt a where a.id=p.id and a.error_class = 'noRetry' limit 1)\n" + "limit " + 
(assignmentsLimit * 10) + ") as non_distinct_results\n" + "order by coalesce(attempt_count, 0), reverse(pubid), url\n" + "limit " + assignmentsLimit + ") as findAssignmentsQuery"; // The "order by" in the end makes sure the older attempted records will be re-attempted after a long time. - String createAssignmentsQuery = "create table " + ImpalaConnector.databaseName + ".current_assignment as \n" + findAssignmentsQuery; - String computeCurrentAssignmentsStatsQuery = "COMPUTE STATS " + ImpalaConnector.databaseName + ".current_assignment"; - String getAssignmentsQuery = "select * from " + ImpalaConnector.databaseName + ".current_assignment"; + String createAssignmentsQuery = "create table " + databaseName + ".current_assignment as \n" + findAssignmentsQuery; + String computeCurrentAssignmentsStatsQuery = "COMPUTE STATS " + databaseName + ".current_assignment"; + String getAssignmentsQuery = "select * from " + databaseName + ".current_assignment"; List assignments = new ArrayList<>(); @@ -171,8 +174,8 @@ public class UrlController { // Write the Assignment details to the assignment-table. // The "timestamp" is generated from the Java-code, so it's in no way provided by a 3rd party. - String insertAssignmentsQuery = "insert into " + ImpalaConnector.databaseName + ".assignment \n select pub_data.pubid, pub_data.url, '" + workerId + "', cast('" + timestamp + "' as timestamp)\n" - + "from (\n select pubid, url from " + ImpalaConnector.databaseName + ".current_assignment) as pub_data"; + String insertAssignmentsQuery = "insert into " + databaseName + ".assignment \n select pub_data.pubid, pub_data.url, '" + workerId + "', cast('" + timestamp + "' as timestamp)\n" + + "from (\n select pubid, url from " + databaseName + ".current_assignment) as pub_data"; try { jdbcTemplate.execute(insertAssignmentsQuery); @@ -252,8 +255,8 @@ public class UrlController { ImpalaConnector.databaseLock.lock(); // Store the workerReport into the database. 
- String insertIntoPayloadBaseQuery = "INSERT INTO " + ImpalaConnector.databaseName + ".payload (id, original_url, actual_url, date, mimetype, size, hash, location, provenance) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"; - String insertIntoAttemptBaseQuery = "INSERT INTO " + ImpalaConnector.databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)"; + String insertIntoPayloadBaseQuery = "INSERT INTO " + databaseName + ".payload (id, original_url, actual_url, date, mimetype, size, hash, location, provenance) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"; + String insertIntoAttemptBaseQuery = "INSERT INTO " + databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)"; String payloadErrorMsg = null; int failedCount = 0; @@ -372,7 +375,7 @@ public class UrlController { } private String dropCurrentAssignmentTable() { - String dropCurrentAssignmentsQuery = "DROP TABLE " + ImpalaConnector.databaseName + ".current_assignment PURGE"; + String dropCurrentAssignmentsQuery = "DROP TABLE " + databaseName + ".current_assignment PURGE"; try { jdbcTemplate.execute(dropCurrentAssignmentsQuery); diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 1e08476..9030a1c 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -40,6 +40,10 @@ public class FileUtils { @Autowired private FileUnZipper fileUnZipper; + @Value("${services.pdfaggregation.controller.db.databaseName}") + private String databaseName; + + public enum UploadFullTextsResponse {successful, unsuccessful, databaseError} public FileUtils() throws RuntimeException { @@ -69,10 +73,10 @@ public class FileUtils { parameter = " '" + parameter + "'"; // This will be a "string-check". 
try { - jdbcTemplate.execute("CREATE TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + ImpalaConnector.databaseName + "." + tableName + " " + whereClause + parameter); - jdbcTemplate.execute("DROP TABLE " + ImpalaConnector.databaseName + "." + tableName + " PURGE"); - jdbcTemplate.execute("ALTER TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp RENAME TO " + ImpalaConnector.databaseName + "." + tableName); - jdbcTemplate.execute("COMPUTE STATS " + ImpalaConnector.databaseName + "." + tableName); + jdbcTemplate.execute("CREATE TABLE " + databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + databaseName + "." + tableName + " " + whereClause + parameter); + jdbcTemplate.execute("DROP TABLE " + databaseName + "." + tableName + " PURGE"); + jdbcTemplate.execute("ALTER TABLE " + databaseName + "." + tableName + "_tmp RENAME TO " + databaseName + "." + tableName); + jdbcTemplate.execute("COMPUTE STATS " + databaseName + "." + tableName); } catch (DataAccessException e) { errorMsg = "Problem when executing the \"clone-drop-rename\" queries!\n"; logger.error(errorMsg, e); @@ -104,7 +108,7 @@ public class FileUtils { ImpalaConnector.databaseLock.lock(); - String getFileLocationForHashQuery = "select `location` from " + ImpalaConnector.databaseName + ".payload where `hash` = ?" ; + String getFileLocationForHashQuery = "select `location` from " + databaseName + ".payload where `hash` = ?" ; // Get the file-locations. 
int numFullTextUrlsFound = 0; From 35966b6f2e6d64cb51d53b4df5df8cc086b6ba76 Mon Sep 17 00:00:00 2001 From: antleb Date: Tue, 1 Feb 2022 16:57:28 +0200 Subject: [PATCH 08/11] finishing touches --- Dockerfile | 8 ++--- .../configuration/ImpalaConnector.java | 2 +- .../controllers/TestController.java | 2 ++ .../urls_controller/util/TestFileUtils.java | 4 +-- src/main/resources/logback-spring.xml | 33 ------------------- 5 files changed, 7 insertions(+), 42 deletions(-) delete mode 100644 src/main/resources/logback-spring.xml diff --git a/Dockerfile b/Dockerfile index 62db9c5..e567ddd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,11 +1,7 @@ FROM openjdk:8-jdk-alpine -VOLUME /tmp -VOLUME /mnt/config -VOLUME /mnt/logs - COPY build/libs/*-SNAPSHOT.jar app.jar -RUN ln -s /mnt/config/application.properties ./application.properties +EXPOSE 1880 -ENTRYPOINT ["java","-jar","/app.jar", "--config.location=application.properties"] +ENTRYPOINT ["java","-jar","/app.jar", "--spring.config.location=file:///mnt/config/application.properties"] diff --git a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java index b0309e9..0a3fd41 100644 --- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java +++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java @@ -12,7 +12,7 @@ import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReentrantLock; @Repository -public final class ImpalaConnector { +public class ImpalaConnector { private static final Logger logger = LoggerFactory.getLogger(ImpalaConnector.class); diff --git a/src/main/java/eu/openaire/urls_controller/controllers/TestController.java b/src/main/java/eu/openaire/urls_controller/controllers/TestController.java index 457f121..1eeb0ed 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/TestController.java +++
b/src/main/java/eu/openaire/urls_controller/controllers/TestController.java @@ -14,6 +14,7 @@ import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.RequestParam; +import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; import java.sql.Timestamp; @@ -21,6 +22,7 @@ import java.util.*; import java.util.concurrent.atomic.AtomicLong; @RestController +@RequestMapping("/test") public class TestController extends GeneralController { private static final Logger logger = LoggerFactory.getLogger(TestController.class); diff --git a/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java b/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java index 8645ed3..fd10b39 100644 --- a/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java @@ -8,6 +8,7 @@ import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.configurationprocessor.json.JSONException; import org.springframework.boot.configurationprocessor.json.JSONObject; import org.springframework.core.io.Resource; +import org.springframework.core.io.ClassPathResource; import org.springframework.stereotype.Component; import java.io.File; @@ -20,8 +21,7 @@ public class TestFileUtils { private static final Logger logger = LoggerFactory.getLogger(TestFileUtils.class); - @Value("classpath:testInputFiles/orderedList1000.json") - Resource testResource; + Resource testResource = new ClassPathResource("testInputFiles/orderedList1000.json"); public ThreadLocal duplicateIdUrlEntries = new ThreadLocal<>(); public ThreadLocal inputScanner = new ThreadLocal<>(); // Every Thread has its own variable. 
diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml deleted file mode 100644 index 86ba426..0000000 --- a/src/main/resources/logback-spring.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - - logs/UrlsController.log - - - logs/UrlsController.%i.log.zip - 1 - 20 - - - - 50MB - - - - UTF-8 - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n - - - - - - UTF-8 - %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %highlight(%-5level) %cyan(%logger{36}.%M\(@%line\)) - %msg%n - - - - - - - - \ No newline at end of file From be4898e43e701237c6236ee71157f10f4dcc9865 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 2 Feb 2022 20:19:46 +0200 Subject: [PATCH 09/11] Bug fixes and improvements: - Fix an NPE, when the "getTestUrls"-endpoint is called. It was thrown because of an absent per-thread initialization of some thread-local variables. - Fix JdbcTemplate error when querying the "getFileLocationForHashQuery". - Fix the "S3ObjectStore.isLocationInStore" check. - Fix not catching/handling some exceptions. - Fix/improve log-messages. - Optimize the "getFileLocationForHashQuery" to return only the first row. In the latest change, without this optimization, the query-result would cause non-handling the same-hash cases, because of an exception. - Optimize the "ImpalaConnector.databaseLock.lock()" positioning. - Update the "getTestUrls" api-path. - Optimize list-allocation. - Re-add the info-message about the successful emptying of the S3-bucket. - Code cleanup. 
--- .../configuration/ImpalaConnector.java | 18 ++- .../controllers/GeneralController.java | 1 + .../controllers/TestController.java | 22 ++-- .../controllers/UrlController.java | 120 ++++++++---------- .../urls_controller/util/FileUtils.java | 50 ++++---- .../urls_controller/util/S3ObjectStore.java | 27 ++-- .../urls_controller/util/TestFileUtils.java | 37 +++--- 7 files changed, 140 insertions(+), 135 deletions(-) diff --git a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java index 0a3fd41..195fad3 100644 --- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java +++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java @@ -24,25 +24,27 @@ public class ImpalaConnector { public static final Lock databaseLock = new ReentrantLock(true); // This lock is locking the threads trying to execute queries in the database. + public ImpalaConnector(@Value("${services.pdfaggregation.controller.db.oldDatabaseName}") String oldDatabaseName, @Value("${services.pdfaggregation.controller.db.databaseName}") String databaseName) { this.oldDatabaseName = oldDatabaseName; this.databaseName = databaseName; } + + @PostConstruct public void init() { logger.info("Max available memory to the Controller: " + Runtime.getRuntime().maxMemory() + " bytes."); - try { - if ( jdbcTemplate.getDataSource().getConnection().getMetaData().supportsBatchUpdates() ) - logger.warn("The database does not support \"BatchUpdates\"!"); + boolean supportsBatchUpdates = jdbcTemplate.getDataSource().getConnection().getMetaData().supportsBatchUpdates(); + logger.info("The database " + (supportsBatchUpdates ? 
"supports" : "does not support") + " \"BatchUpdates\"!"); } catch (Exception e) { - logger.error("Error testing if database supports batch updates", e); + logger.error("Error testing if database supports batch updates!", e); } - - createDatabase(); + createDatabase(); // In case of an exception, the App will exit with the stacktrace. } + private void createDatabase() { logger.info("Going to create the database and the tables, if they do not exist. Also will fill some tables with data from OpenAIRE."); @@ -74,9 +76,11 @@ public class ImpalaConnector { logger.info("The database \"" + databaseName + "\" and its tables were created or validated."); } - public static String handlePreparedStatementException(String queryName, String query, Exception e) { + + public static String handleQueryException(String queryName, String query, Exception e) { String errorMsg = "Problem when creating " + (( ! queryName.startsWith("get")) ? "and executing " : "") + "the prepared statement for \"" + queryName + "\"!\n"; logger.error(errorMsg + "\n\n" + query + "\n\n", e); return errorMsg; } + } diff --git a/src/main/java/eu/openaire/urls_controller/controllers/GeneralController.java b/src/main/java/eu/openaire/urls_controller/controllers/GeneralController.java index 9af2840..8050c21 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/GeneralController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/GeneralController.java @@ -21,4 +21,5 @@ public class GeneralController { return ResponseEntity.ok().build(); } + } diff --git a/src/main/java/eu/openaire/urls_controller/controllers/TestController.java b/src/main/java/eu/openaire/urls_controller/controllers/TestController.java index 1eeb0ed..2ac6dbd 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/TestController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/TestController.java @@ -13,32 +13,35 @@ import org.springframework.beans.factory.annotation.Value; import 
org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RequestMapping; +import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; import java.sql.Timestamp; import java.util.*; import java.util.concurrent.atomic.AtomicLong; + @RestController @RequestMapping("/test") -public class TestController extends GeneralController { +public class TestController { private static final Logger logger = LoggerFactory.getLogger(TestController.class); @Autowired - private TestFileUtils fileUtils; + private TestFileUtils testFileUtils; @Value("${services.pdfaggregation.controller.assignmentLimit}") private int assignmentLimit; - private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); // Just for the "getTestUrls"-endpoint. + private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); - @GetMapping("test") + + @GetMapping("urls") public ResponseEntity getTestUrls(@RequestParam String workerId, @RequestParam int workerAssignmentsLimit) { logger.info("Worker with id: \"" + workerId + "\", requested " + workerAssignmentsLimit + " test-assignments. The assignments-limit of the controller is: " + this.assignmentLimit); + logger.debug("Going to retrieve the data from the inputResourceFile: " + testFileUtils.testResource.getFilename()); List assignments = new ArrayList<>(); HashMultimap loadedIdUrlPairs; @@ -48,9 +51,9 @@ public class TestController extends GeneralController { // Start loading urls. while ( true ) { - loadedIdUrlPairs = fileUtils.getNextIdUrlPairBatchFromJson(); // Take urls from jsonFile. + loadedIdUrlPairs = testFileUtils.getNextIdUrlPairBatchFromJson(); // Take urls from jsonFile. 
- if ( fileUtils.isFinishedLoading(loadedIdUrlPairs.isEmpty(), isFirstRun) ) // Throws RuntimeException which is automatically passed on. + if ( testFileUtils.isFinishedLoading(loadedIdUrlPairs.isEmpty(), isFirstRun) ) // Throws RuntimeException which is automatically passed on. break; else isFirstRun = false; @@ -73,12 +76,13 @@ public class TestController extends GeneralController { } }// end loading-while-loop - Scanner scanner = fileUtils.inputScanner.get(); + Scanner scanner = testFileUtils.inputScanner.get(); if ( scanner != null ) // Check if the initial value is null. scanner.close(); long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet(); - logger.info("Sending batch_" + curAssignmentsBatchCounter + " with " + assignments.size() + " assignments (" + fileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId); + logger.info("Sending batch_" + curAssignmentsBatchCounter + " with " + assignments.size() + " assignments (" + testFileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId); return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments)); } + } diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index 0b9b3b9..19e4cc6 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -13,7 +13,6 @@ import org.springframework.beans.factory.annotation.Value; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import org.springframework.jdbc.core.JdbcTemplate; -import org.springframework.jdbc.core.RowCallbackHandler; import org.springframework.web.bind.annotation.*; 
import javax.servlet.http.HttpServletRequest; @@ -23,6 +22,7 @@ import java.util.List; import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; + @RestController @RequestMapping("/urls") public class UrlController { @@ -35,7 +35,7 @@ public class UrlController { @Autowired private FileUtils fileUtils; - private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); // Just for the "getTestUrls"-endpoint. + private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); private static final Pattern MALICIOUS_INPUT_STRING = Pattern.compile(".*[';`\"]+.*"); @Value("${services.pdfaggregation.controller.maxAttemptsPerRecord}") @@ -47,6 +47,7 @@ public class UrlController { @Value("${services.pdfaggregation.controller.db.databaseName}") private String databaseName; + @GetMapping("") public ResponseEntity getUrls(@RequestParam String workerId, @RequestParam int workerAssignmentsLimit) { @@ -90,78 +91,77 @@ public class UrlController { // The "order by" in the end makes sure the older attempted records will be re-attempted after a long time. 
- String createAssignmentsQuery = "create table " + databaseName + ".current_assignment as \n" + findAssignmentsQuery; + String createCurrentAssignmentsQuery = "create table " + databaseName + ".current_assignment as \n" + findAssignmentsQuery; String computeCurrentAssignmentsStatsQuery = "COMPUTE STATS " + databaseName + ".current_assignment"; String getAssignmentsQuery = "select * from " + databaseName + ".current_assignment"; - List assignments = new ArrayList<>(); + List assignments = new ArrayList<>(assignmentsLimit); ImpalaConnector.databaseLock.lock(); try { - jdbcTemplate.execute(createAssignmentsQuery); + jdbcTemplate.execute(createCurrentAssignmentsQuery); } catch (Exception sqle) { ImpalaConnector.databaseLock.unlock(); + String errorMsg = ImpalaConnector.handleQueryException("createCurrentAssignmentsQuery", createCurrentAssignmentsQuery, sqle); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); } try { jdbcTemplate.execute(computeCurrentAssignmentsStatsQuery); } catch (Exception sqle) { String errorMsg = dropCurrentAssignmentTable(); + ImpalaConnector.databaseLock.unlock(); - if ( errorMsg != null ) // The "databaseLock" is already unlocked. + if ( errorMsg != null ) return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - ImpalaConnector.databaseLock.unlock(); - errorMsg = ImpalaConnector.handlePreparedStatementException("computeCurrentAssignmentsStatsQuery", computeCurrentAssignmentsStatsQuery, sqle); + errorMsg = ImpalaConnector.handleQueryException("computeCurrentAssignmentsStatsQuery", computeCurrentAssignmentsStatsQuery, sqle); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); } Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records. 
try { - jdbcTemplate.query(getAssignmentsQuery, new RowCallbackHandler() { - @Override - public void processRow(ResultSet rs) throws SQLException { - Assignment assignment = new Assignment(); - assignment.setWorkerId(workerId); - assignment.setTimestamp(timestamp); - Datasource datasource = new Datasource(); - try { // For each of the 4 columns returned. The indexing starts from 1 - assignment.setId(rs.getString(1)); - assignment.setOriginalUrl(rs.getString(2)); - datasource.setId(rs.getString(3)); - datasource.setName(rs.getString(4)); - } catch (SQLException sqle) { - logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle); - } - assignment.setDatasource(datasource); - assignments.add(assignment); + jdbcTemplate.query(getAssignmentsQuery, rs -> { + Assignment assignment = new Assignment(); + assignment.setWorkerId(workerId); + assignment.setTimestamp(timestamp); + Datasource datasource = new Datasource(); + try { // For each of the 4 columns returned. The indexing starts from 1 + assignment.setId(rs.getString(1)); + assignment.setOriginalUrl(rs.getString(2)); + datasource.setId(rs.getString(3)); + datasource.setName(rs.getString(4)); + } catch (SQLException sqle) { + logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle); } + assignment.setDatasource(datasource); + assignments.add(assignment); }); } catch (Exception e) { String errorMsg = dropCurrentAssignmentTable(); - if ( errorMsg != null ) // The "databaseLock" is already unlocked. 
- return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); - ImpalaConnector.databaseLock.unlock(); + if ( errorMsg != null ) + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); + errorMsg = "Problem when executing the \"getAssignmentsQuery\"!\n"; logger.error(errorMsg, e); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); } int assignmentsSize = assignments.size(); if ( assignmentsSize == 0 ) { String errorMsg = dropCurrentAssignmentTable(); - if ( errorMsg != null ) // The "databaseLock" is already unlocked. - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); ImpalaConnector.databaseLock.unlock(); + + if ( errorMsg != null ) + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); + maxAttemptsPerRecord += 2; // Increase the max-attempts to try again some very old records, in the next requests. errorMsg = "No results retrieved from the \"findAssignmentsQuery\" for worker with id: " + workerId + ". Will increase the \"maxAttempts\" to " + maxAttemptsPerRecord + " for the next requests."; logger.error(errorMsg); - return ResponseEntity.status(HttpStatus.NO_CONTENT).body(errorMsg); } else if ( assignmentsSize < assignmentsLimit ) { maxAttemptsPerRecord += 2; // Increase the max-attempts to try again some very old records, in the next requests. @@ -181,23 +181,26 @@ public class UrlController { jdbcTemplate.execute(insertAssignmentsQuery); } catch (Exception sqle) { String errorMsg = dropCurrentAssignmentTable(); - if ( errorMsg != null ) // The "databaseLock" is already unlocked. 
- return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); ImpalaConnector.databaseLock.unlock(); - errorMsg = ImpalaConnector.handlePreparedStatementException("insertAssignmentsQuery", insertAssignmentsQuery, sqle); + + if ( errorMsg != null ) + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); + + errorMsg = ImpalaConnector.handleQueryException("insertAssignmentsQuery", insertAssignmentsQuery, sqle); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); } String errorMsg = dropCurrentAssignmentTable(); - if ( errorMsg != null ) // The "databaseLock" is already unlocked. + if ( errorMsg != null ) { + ImpalaConnector.databaseLock.unlock(); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); + } logger.debug("Finished inserting " + assignmentsSize + " assignments into the \"assignment\"-table. Going to merge the parquet files for this table."); String mergeErrorMsg = fileUtils.mergeParquetFiles("assignment", "", null); if ( mergeErrorMsg != null ) { ImpalaConnector.databaseLock.unlock(); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } @@ -208,6 +211,7 @@ public class UrlController { return ResponseEntity.status(HttpStatus.OK).body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments)); } + @PostMapping("addWorkerReport") public ResponseEntity addWorkerReport(@RequestBody WorkerReport workerReport, HttpServletRequest request) { @@ -248,44 +252,38 @@ public class UrlController { } else if ( uploadFullTextsResponse == FileUtils.UploadFullTextsResponse.unsuccessful ) { logger.error("Failed to get and/or upload the fullTexts for assignments_" + curReportAssignments); - // The docUrls were still found! Just update ALL the fileLocations. sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads. + // The docUrls were still found! 
Just update ALL the fileLocations, sizes, hashes and mimetypes, to show that the files are not available and continue with writing the attempts and the payloads. fileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports); } - ImpalaConnector.databaseLock.lock(); - - // Store the workerReport into the database. + // Store the workerReport into the database. We use "PreparedStatements" to do insertions, for security and valid SQL syntax reasons. String insertIntoPayloadBaseQuery = "INSERT INTO " + databaseName + ".payload (id, original_url, actual_url, date, mimetype, size, hash, location, provenance) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)"; String insertIntoAttemptBaseQuery = "INSERT INTO " + databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)"; - String payloadErrorMsg = null; int failedCount = 0; - // TODO - Think about handling this loop with multiple threads.. - // The Impala-server will handle the synchronization itself.. - // Check online what happens with "statement.setPoolable()" does it improves speed? in multi or also in single thread? + ImpalaConnector.databaseLock.lock(); + // TODO - Think about handling this loop with multiple threads.. The Impala-server will handle the synchronization itself.. for ( UrlReport urlReport : urlReports ) { Payload payload = urlReport.getPayload(); - if ( payload == null ) { logger.warn("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments); payloadErrorMsg = (++failedCount) + " urlReports failed to be processed because they had no payload!"; - continue; } - try { // We use a "PreparedStatement" to do insertions, for security and valid SQL syntax reasons. 
+ try { + Long size = payload.getSize(); Object[] args = new Object[] { payload.getId(), payload.getOriginal_url(), payload.getActual_url(), payload.getTimestamp_acquired(), - payload.getMime_type(), payload.getSize() != null?String.valueOf(payload.getSize()):null, payload.getHash(), + payload.getMime_type(), (size != null) ? String.valueOf(size) : null, payload.getHash(), payload.getLocation(), payload.getProvenance()}; int[] argTypes = new int[] { Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR}; jdbcTemplate.update(insertIntoPayloadBaseQuery, args, argTypes); - } catch (Exception sqle) { logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": ", sqle); } @@ -300,17 +298,14 @@ public class UrlController { Object[] args = new Object[] { payload.getId(), payload.getOriginal_url(), payload.getTimestamp_acquired(), urlReport.getStatus().toString(), String.valueOf(error.getType()), error.getMessage()}; - int[] argTypes = new int[] { - Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, - Types.VARCHAR}; + int[] argTypes = new int[] {Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR}; jdbcTemplate.update(insertIntoAttemptBaseQuery, args, argTypes); } catch (Exception sqle) { - logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": ", sqle.getMessage()); + logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage()); } }//end for-loop - if ( payloadErrorMsg != null ) logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables, although " + payloadErrorMsg + " Going to merge the parquet files for those tables."); else @@ -319,33 +314,30 @@ public class UrlController { String mergeErrorMsg = fileUtils.mergeParquetFiles("payload", "", null); if ( mergeErrorMsg != null ) { 
ImpalaConnector.databaseLock.unlock(); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } mergeErrorMsg = fileUtils.mergeParquetFiles("attempt", "", null); if ( mergeErrorMsg != null ) { ImpalaConnector.databaseLock.unlock(); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } // This will delete the rows of the "assignment" table which refer to the curWorkerId. As we have non-kudu Impala tables, the Delete operation can only succeed through a "merge" operation of the rest of the data. // Only the rows referring to OTHER workerIDs get stored in a temp-table, while the "assignment" table gets deleted. Then, the temp_table becomes the "assignment" table. - // We do not need to keep the assignment-info anymore, the "findAssignmentsQuery" checks the payload table for previously handled tasks. + // We don't need to keep the assignment-info anymore, the "findAssignmentsQuery" checks the payload table for previously handled tasks. mergeErrorMsg = fileUtils.mergeParquetFiles("assignment", " WHERE workerid != ", curWorkerId); if ( mergeErrorMsg != null ) { ImpalaConnector.databaseLock.unlock(); - return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } ImpalaConnector.databaseLock.unlock(); - logger.debug("Finished merging the database tables."); return ResponseEntity.status(HttpStatus.OK).body(payloadErrorMsg); } + // The "batchExecute" does not work in this Impala-Database, so this is a "giant-query" solution. // Note: this causes an "Out of memory"-ERROR in the current version of the Impala JDBC driver. If a later version is provided, then this code should be tested. 
private static PreparedStatement constructLargeInsertQuery(Connection con, String baseInsertQuery, int dataSize, int numParamsPerRow) throws RuntimeException { @@ -374,15 +366,15 @@ public class UrlController { return preparedInsertStatement; } + private String dropCurrentAssignmentTable() { String dropCurrentAssignmentsQuery = "DROP TABLE " + databaseName + ".current_assignment PURGE"; - try { jdbcTemplate.execute(dropCurrentAssignmentsQuery); return null; - } catch (Exception sqle) { - ImpalaConnector.databaseLock.unlock(); - return ImpalaConnector.handlePreparedStatementException("dropCurrentAssignmentsQuery", dropCurrentAssignmentsQuery, sqle); + } catch (Exception e) { + return ImpalaConnector.handleQueryException("dropCurrentAssignmentsQuery", dropCurrentAssignmentsQuery, e); } } + } diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 9030a1c..3936dea 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -4,12 +4,12 @@ import com.google.common.collect.HashMultimap; import eu.openaire.urls_controller.configuration.ImpalaConnector; import eu.openaire.urls_controller.models.Payload; import eu.openaire.urls_controller.models.UrlReport; -import org.codehaus.groovy.syntax.Types; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.dao.DataAccessException; +import org.springframework.dao.EmptyResultDataAccessException; import org.springframework.jdbc.core.JdbcTemplate; import org.springframework.stereotype.Component; @@ -20,6 +20,7 @@ import java.net.URL; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.sql.Types; import java.util.ArrayList; import java.util.List; import java.util.Set; @@ 
-46,8 +47,6 @@ public class FileUtils { public enum UploadFullTextsResponse {successful, unsuccessful, databaseError} - public FileUtils() throws RuntimeException { - } /** * In each insertion, a new parquet-file is created, so we end up with millions of files. Parquet is great for fast-select, so have to stick with it and merge those files.. @@ -70,7 +69,7 @@ public class FileUtils { if ( parameter == null ) parameter = ""; else - parameter = " '" + parameter + "'"; // This will be a "string-check". + parameter = " '" + parameter + "'"; // This will be a "string-check", thus the single-quotes. try { jdbcTemplate.execute("CREATE TABLE " + databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + databaseName + "." + tableName + " " + whereClause + parameter); @@ -86,6 +85,7 @@ public class FileUtils { return null; // No errorMsg, everything is fine. } + private final Pattern FILENAME_ID = Pattern.compile("([\\w_:]+)\\.[\\w]{2,10}$"); private final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:]+\\.[\\w]{2,10})$"); @@ -94,6 +94,7 @@ public class FileUtils { private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames). + public UploadFullTextsResponse getAndUploadFullTexts(List urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) { // The Controller have to request the files from the Worker, in order to upload them to the S3. // We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database. @@ -106,14 +107,13 @@ public class FileUtils { if ( remoteAddr == null || "".equals(remoteAddr) ) remoteAddr = request.getRemoteAddr(); - ImpalaConnector.databaseLock.lock(); - - String getFileLocationForHashQuery = "select `location` from " + databaseName + ".payload where `hash` = ?" ; - // Get the file-locations. 
int numFullTextUrlsFound = 0; int numFilesFoundFromPreviousAssignmentsBatches = 0; HashMultimap allFileNamesWithIDsHashMap = HashMultimap.create((urlReports.size() / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it. + String getFileLocationForHashQuery = "select `location` from " + databaseName + ".payload where `hash` = ? limit 1" ; + + ImpalaConnector.databaseLock.lock(); for ( UrlReport urlReport : urlReports ) { UrlReport.StatusType statusType = urlReport.getStatus(); @@ -126,7 +126,7 @@ public class FileUtils { if ( payload == null ) continue; - String fileLocation; + String fileLocation = null; // Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH. // If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker. @@ -134,21 +134,23 @@ public class FileUtils { // Use the same prepared-statement for all requests, to improve speed (just like when inserting similar thing to the DB). String fileHash = payload.getHash(); if ( fileHash != null ) { - - fileLocation = jdbcTemplate.queryForObject(getFileLocationForHashQuery, new Object[] {fileHash}, new int[] {Types.STRING}, String.class); + try { + fileLocation = jdbcTemplate.queryForObject(getFileLocationForHashQuery, new Object[] {fileHash}, new int[] {Types.VARCHAR}, String.class); + } catch (EmptyResultDataAccessException erdae) { + // No fileLocation is found, it's ok. It will be null by default. + } catch (Exception e) { + logger.error("Error when executing or acquiring data from the the \"getFileLocationForHashQuery\"!\n", e); + // TODO - SHOULD WE RETURN A "UploadFullTextsResponse.databaseError" AND force the caller to not even insert the payloads to the database?? + // TODO - Since the database will have problems.. 
there is no point in trying to insert the payloads to Impala (we will handle it like: we tried to insert and got an error). + // TODO - In case we DO return, UNLOCK the database-lock and close the Prepared statement (it's not auto-closed here)and the Database connection. + } if ( fileLocation != null ) { // If the full-text of this record is already-found and uploaded. payload.setLocation(fileLocation); // Set the location to the older identical file, which was uploaded to S3. The other file-data is identical. - //logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + fileLocation + "\"."); // DEBUG! numFilesFoundFromPreviousAssignmentsBatches ++; - continue; // Do not request the file from the worker, it's already uploaded. Move on. } - - // TODO - SHOULD WE RETURN A "UploadFullTextsResponse.databaseError" AND force the caller to not even insert the payloads to the database?? - // TODO - Since the database will have problems.. there is no point in trying to insert the payloads to Impala (we will handle it like: we tried to insert and got an error). - // TODO - In case we DO return, UNLOCK the database-lock and close the Prepared statement (it's not auto-closed here)and the Database connection. } // If the full-text of this record was not found by a previous batch... @@ -165,9 +167,9 @@ public class FileUtils { allFileNamesWithIDsHashMap.put(fileNameWithExtension, payload.getId()); // The keys and the values are not duplicate. Task with ID-1 might have an "ID-1.pdf" file. // While a task with ID-2 can also have an "ID-1.pdf" file, as the pdf-url-2 might be the same with pdf-url-1, thus, the ID-2 file was not downloaded again. } - } + }// end-for - ImpalaConnector.databaseLock.unlock(); // The rest work of this function does not use the database. + ImpalaConnector.databaseLock.unlock(); // The remaining work of this function does not use the database. 
logger.info("NumFullTextUrlsFound by assignments_" + assignmentsBatchCounter + " = " + numFullTextUrlsFound + " (out of " + urlReports.size() + ")."); logger.debug("NumFilesFoundFromPreviousAssignmentsBatches = " + numFilesFoundFromPreviousAssignmentsBatches); @@ -259,10 +261,12 @@ public class FileUtils { // At this point, we know that this file is related with one or more IDs of the payloads AND it has a valid fileName. // Let's try to upload the file to S3 and update the payloads of all related IDs, either in successful upload or not. - String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath); - if ( s3Url != null ) { - setFullTextForMultipleIDs(fileRelatedIDs, payloadsHashMultimap, s3Url); // It checks weather (s3Url != null) and acts accordingly. + try { + String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath); + setFullTextForMultipleIDs(fileRelatedIDs, payloadsHashMultimap, s3Url); numUploadedFiles ++; + } catch (Exception e) { + logger.error("Could not upload the file \"" + fileName + "\" to the S3 ObjectStore, exception: " + e.getMessage(), e); } // Else, the record will have its file-data set to "null", in the end of this method. } @@ -448,7 +452,7 @@ public class FileUtils { if ( payload != null ) { String fileLocation = payload.getLocation(); - if ( (fileLocation != null) && (! s3ObjectStore.locationInStore(fileLocation)) ) + if ( (fileLocation != null) && (! s3ObjectStore.isLocationInStore(fileLocation)) ) setUnretrievedFullText(payload); } } diff --git a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java index 8ceab10..e60fcfe 100644 --- a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java +++ b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java @@ -50,11 +50,11 @@ public class S3ObjectStore { } // Make the bucket, if not exist. - if ( !bucketExists ) { - logger.info("Bucket \"" + bucketName + "\" does not exist! 
Going to create it..");
- minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build());
- } else
- logger.debug("Bucket \"" + bucketName + "\" already exists.");
+ if ( !bucketExists ) {
+ logger.info("Bucket \"" + bucketName + "\" does not exist! Going to create it..");
+ minioClient.makeBucket(MakeBucketArgs.builder().bucket(bucketName).build());
+ } else
+ logger.debug("Bucket \"" + bucketName + "\" already exists.");
if ( shouldShowAllS3Buckets ) {
List buckets = null;
@@ -89,10 +89,10 @@ public class S3ObjectStore {
else {
if ( extension.equals("pdf") )
contentType = "application/pdf";
- /*else if ( *//* TODO - other-extension-match *//* )
- contentType = "application/pdf"; */
+ /*else if ( *//* TODO - other-extension-match *//* )
+ contentType = "application/EXTENSION"; */
else
- contentType = "application/pdf";
+ contentType = "application/pdf"; // Default.
}
} else {
logger.warn("The file with key \"" + fileObjKeyName + "\" does not have a file-extension! Setting the \"pdf\"-mimeType.");
@@ -104,8 +104,9 @@ public class S3ObjectStore {
.object(fileObjKeyName).filename(fileFullPath)
.contentType(contentType).build());
- // TODO - What if the fileObjKeyName already exists?
- // Right now it gets overwritten (unless we add versioning, which is irrelevant for different objects..)
+ // TODO - What if the fileObjKeyName already exists? Right now it gets overwritten (unless we add versioning, which is not currently supported by our S3ObjectStore).
+ // Each Worker handles some of these cases, but in case of id-urls splitting between different workers or re-attempting some temporarily faulty urls later,
+ // duplicate fileNames may appear and cause file-overwriting from the part of S3ObjectStore.
String s3Url = s3Protocol + bucketName + "/" + fileObjKeyName; // Be aware: This url works only if the access to the bucket is public.
//logger.debug("Uploaded file \"" + fileObjKeyName + "\".
The s3Url is: " + s3Url); @@ -130,10 +131,12 @@ public class S3ObjectStore { // Lastly, delete the empty bucket. minioClient.removeBucket(RemoveBucketArgs.builder().bucket(bucketName).build()); } + + logger.info("Bucket " + bucketName + " was " + (shouldDeleteBucket ? "deleted!" : "emptied!")); } - public boolean locationInStore(String location) { - return location.startsWith(endpoint); + public boolean isLocationInStore(String location) { + return location.startsWith(s3Protocol); } private void deleteFile(String fileObjKeyName, String bucketName) throws Exception { diff --git a/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java b/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java index fd10b39..1386445 100644 --- a/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/TestFileUtils.java @@ -4,14 +4,12 @@ import com.google.common.collect.HashMultimap; import eu.openaire.urls_controller.models.Task; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Value; import org.springframework.boot.configurationprocessor.json.JSONException; import org.springframework.boot.configurationprocessor.json.JSONObject; -import org.springframework.core.io.Resource; import org.springframework.core.io.ClassPathResource; +import org.springframework.core.io.Resource; import org.springframework.stereotype.Component; -import java.io.File; import java.io.IOException; import java.io.InputStream; import java.util.Scanner; @@ -21,33 +19,30 @@ public class TestFileUtils { private static final Logger logger = LoggerFactory.getLogger(TestFileUtils.class); - Resource testResource = new ClassPathResource("testInputFiles/orderedList1000.json"); + public Resource testResource = new ClassPathResource("testInputFiles/orderedList1000.json"); - public ThreadLocal duplicateIdUrlEntries = new ThreadLocal<>(); - public ThreadLocal inputScanner = new 
ThreadLocal<>(); // Every Thread has its own variable. + public ThreadLocal duplicateIdUrlEntries; + public ThreadLocal inputScanner; private final int jsonBatchSize = 3000; - private final ThreadLocal fileIndex = new ThreadLocal<>(); - private final ThreadLocal unretrievableInputLines = new ThreadLocal<>(); + private ThreadLocal fileIndex; + private ThreadLocal unretrievableInputLines; private final String utf8Charset = "UTF-8"; - public TestFileUtils() throws IOException { - String resourceFileName = "testInputFiles/orderedList1000.json"; + public TestFileUtils() throws IOException { InputStream inputStream = testResource.getInputStream(); if ( inputStream == null ) - throw new RuntimeException("No resourceFile was found with name \"" + resourceFileName + "\"."); + throw new RuntimeException("No resourceFile was found with name \"" + testResource.getFilename() + "\"!"); - logger.debug("Going to retrieve the data from the inputResourceFile: " + resourceFileName); - - inputScanner.set(new Scanner(inputStream, utf8Charset)); - - fileIndex.set(0); - unretrievableInputLines.set(0); - duplicateIdUrlEntries.set(0); + inputScanner = ThreadLocal.withInitial(() -> new Scanner(inputStream, utf8Charset)); + fileIndex = ThreadLocal.withInitial(() -> 0); + unretrievableInputLines = ThreadLocal.withInitial(() -> 0); + duplicateIdUrlEntries = ThreadLocal.withInitial(() -> 0); } + /** * This method parses a Json file and extracts the urls, along with the IDs. * @return HashMultimap @@ -58,7 +53,6 @@ public class TestFileUtils { int expectedIDsPerBatch = jsonBatchSize / expectedPathsPerID; HashMultimap idAndUrlMappedInput = HashMultimap.create(expectedIDsPerBatch, expectedPathsPerID); - int curBeginning = fileIndex.get(); while ( inputScanner.get().hasNextLine() && (fileIndex.get() < (curBeginning + jsonBatchSize)) ) @@ -91,6 +85,7 @@ public class TestFileUtils { return idAndUrlMappedInput; } + /** * This method decodes a Json String and returns its members. 
* @param jsonLine String @@ -117,6 +112,7 @@ public class TestFileUtils { return new Task(idStr, urlStr, null); } + /** * This method checks if there is no more input-data and returns true in that case. * Otherwise, it returns false, if there is more input-data to be loaded. @@ -124,7 +120,6 @@ public class TestFileUtils { * @param isEmptyOfData * @param isFirstRun * @return finished loading / not finished - * @throws RuntimeException */ public boolean isFinishedLoading(boolean isEmptyOfData, boolean isFirstRun) { if ( isEmptyOfData ) { @@ -137,6 +132,7 @@ public class TestFileUtils { return false; } + /** * This method returns the number of (non-heading, non-empty) lines we have read from the inputFile. * @return loadedUrls @@ -144,4 +140,5 @@ public class TestFileUtils { private int getCurrentlyLoadedUrls() { // In the end, it gives the total number of urls we have processed. return fileIndex.get() - unretrievableInputLines.get(); } + } From 6aab1d242b3ff878295f3e1c38293f1b2ceb385e Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 4 Feb 2022 14:48:22 +0200 Subject: [PATCH 10/11] - Improve performance when handling WorkerReports' database insertions, by using parallelism to insert to two different tables in the same time. Also, pre-cache the query-argument-types. - Update the error-message and counting system, on partial insertion event. 
--- .../openaire/urls_controller/Application.java | 32 ++++- .../controllers/UrlController.java | 129 +++++++++++------- .../urls_controller/util/FileUtils.java | 4 +- 3 files changed, 112 insertions(+), 53 deletions(-) diff --git a/src/main/java/eu/openaire/urls_controller/Application.java b/src/main/java/eu/openaire/urls_controller/Application.java index 1803e0b..8f65f93 100644 --- a/src/main/java/eu/openaire/urls_controller/Application.java +++ b/src/main/java/eu/openaire/urls_controller/Application.java @@ -1,7 +1,8 @@ package eu.openaire.urls_controller; -import eu.openaire.urls_controller.util.UriBuilder; -import org.springframework.boot.CommandLineRunner; +import eu.openaire.urls_controller.controllers.UrlController; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.springframework.boot.SpringApplication; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.context.annotation.Bean; @@ -11,13 +12,17 @@ import org.springframework.web.cors.CorsConfiguration; import org.springframework.web.cors.CorsConfigurationSource; import org.springframework.web.cors.UrlBasedCorsConfigurationSource; +import javax.annotation.PreDestroy; import java.util.Arrays; import java.util.Collections; +import java.util.concurrent.TimeUnit; @SpringBootApplication @EnableScheduling public class Application { + private static final Logger logger = LoggerFactory.getLogger(Application.class); + public static void main(String[] args) { SpringApplication.run(Application.class, args); } @@ -33,6 +38,29 @@ public class Application { source.registerCorsConfiguration("/**", configuration); return source; } + + + @PreDestroy + public void closeThreads() { + logger.info("Shutting down the threads.."); + UrlController.insertsExecutor.shutdown(); // Define that no new tasks will be scheduled. + try { + if ( ! UrlController.insertsExecutor.awaitTermination(1, TimeUnit.MINUTES) ) { + logger.warn("The working threads did not finish on time! 
Stopping them immediately.."); + UrlController.insertsExecutor.shutdownNow(); + } + } catch (SecurityException se) { + logger.error("Could not shutdown the threads in any way..!", se); + } catch (InterruptedException ie) { + try { + UrlController.insertsExecutor.shutdownNow(); + } catch (SecurityException se) { + logger.error("Could not shutdown the threads in any way..!", se); + } + } + } + + // // @Bean // public CommandLineRunner setServerBaseUrl(Environment environment) { diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index 19e4cc6..eb47805 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -19,6 +19,10 @@ import javax.servlet.http.HttpServletRequest; import java.sql.*; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Pattern; @@ -212,6 +216,8 @@ public class UrlController { } + public static ExecutorService insertsExecutor = Executors.newFixedThreadPool(2); + @PostMapping("addWorkerReport") public ResponseEntity addWorkerReport(@RequestBody WorkerReport workerReport, HttpServletRequest request) { @@ -258,58 +264,84 @@ public class UrlController { // Store the workerReport into the database. We use "PreparedStatements" to do insertions, for security and valid SQL syntax reasons. 
String insertIntoPayloadBaseQuery = "INSERT INTO " + databaseName + ".payload (id, original_url, actual_url, date, mimetype, size, hash, location, provenance) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)";
+ int[] payloadArgTypes = new int[] {Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR};
+
String insertIntoAttemptBaseQuery = "INSERT INTO " + databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)";
- String payloadErrorMsg = null;
- int failedCount = 0;
+ int[] attemptArgTypes = new int[] {Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR};
+
+ final AtomicInteger failedCount = new AtomicInteger(0);
+
+ List> callableTasks = new ArrayList<>(2);
+ // One thread will handle the inserts to the "payload" table and the other to the "attempt" table. This way there will be as little blocking as possible (from the part of Impala).
+
+ callableTasks.add(() -> { // Handle inserts to the "payload" table.
+ for ( UrlReport urlReport : urlReports ) {
+ Payload payload = urlReport.getPayload();
+ if ( payload == null ) {
+ logger.warn("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments + "\n" + urlReport);
+ continue;
+ }
+
+ try {
+ Long size = payload.getSize();
+ Object[] args = new Object[] {payload.getId(), payload.getOriginal_url(), payload.getActual_url(), payload.getTimestamp_acquired(),
+ payload.getMime_type(), (size != null) ? String.valueOf(size) : null, payload.getHash(),
+ payload.getLocation(), payload.getProvenance()};
+
+ jdbcTemplate.update(insertIntoPayloadBaseQuery, args, payloadArgTypes);
+ } catch (Exception sqle) {
+ logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": ", sqle);
+ failedCount.incrementAndGet();
+ }
+ }
+ return null;
+ });
+
+ callableTasks.add(() -> { // Handle inserts to the "attempt" table.
+ for ( UrlReport urlReport : urlReports ) { + Payload payload = urlReport.getPayload(); + if ( payload == null ) { + logger.warn("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments + "\n" + urlReport); + continue; + } + + Error error = urlReport.getError(); + if ( error == null ) { // A bit rare to happen, but we should fix it (otherwise NPEs will be thrown for the rest of this loop). + logger.warn("Error was \"null\" for \"urlReport\": " + urlReport + "\nSetting an empty object with \"null\" members."); + error = new Error(null, null); + } + + try { // We use a "PreparedStatement" to do insertions, for security and valid SQL syntax reasons. + Object[] args = new Object[] {payload.getId(), payload.getOriginal_url(), payload.getTimestamp_acquired(), + urlReport.getStatus().toString(), String.valueOf(error.getType()), error.getMessage()}; + + jdbcTemplate.update(insertIntoAttemptBaseQuery, args, attemptArgTypes); + } catch (Exception sqle) { + logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage()); + failedCount.incrementAndGet(); + } + } + return null; + }); ImpalaConnector.databaseLock.lock(); - // TODO - Think about handling this loop with multiple threads.. The Impala-server will handle the synchronization itself.. - for ( UrlReport urlReport : urlReports ) { - Payload payload = urlReport.getPayload(); - if ( payload == null ) { - logger.warn("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments); - payloadErrorMsg = (++failedCount) + " urlReports failed to be processed because they had no payload!"; - continue; - } + try { // Invoke all the tasks and wait for them to finish before moving to the next batch. + insertsExecutor.invokeAll(callableTasks); + } catch (InterruptedException ie) { // In this case, any unfinished tasks are cancelled. 
+ logger.warn("The current thread was interrupted when waiting for the worker-threads to finish inserting into the tables: " + ie.getMessage()); + // TODO - This is a very rare case, but what should be done..? + } catch (Exception e) { + ImpalaConnector.databaseLock.unlock(); + String errorMsg = "Unexpected error when inserting into the \"payload\" and \"attempt\" tables in parallel! " + e.getMessage(); + logger.error(errorMsg, e); + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); + } - try { - Long size = payload.getSize(); - Object[] args = new Object[] { - payload.getId(), payload.getOriginal_url(), payload.getActual_url(), payload.getTimestamp_acquired(), - payload.getMime_type(), (size != null) ? String.valueOf(size) : null, payload.getHash(), - payload.getLocation(), payload.getProvenance()}; - int[] argTypes = new int[] { - Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, - Types.VARCHAR, Types.VARCHAR, Types.VARCHAR}; - - jdbcTemplate.update(insertIntoPayloadBaseQuery, args, argTypes); - } catch (Exception sqle) { - logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": ", sqle); - } - - Error error = urlReport.getError(); - if ( error == null ) { // A bit rare to happen, but we should fix it (otherwise NPEs will be thrown for the rest of this loop). - logger.warn("Error was \"null\" for \"urlReport\": " + urlReport + "\nSetting an empty object with \"null\" members."); - error = new Error(null, null); - } - - try { // We use a "PreparedStatement" to do insertions, for security and valid SQL syntax reasons. 
- Object[] args = new Object[] { - payload.getId(), payload.getOriginal_url(), payload.getTimestamp_acquired(), - urlReport.getStatus().toString(), String.valueOf(error.getType()), error.getMessage()}; - int[] argTypes = new int[] {Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR}; - - jdbcTemplate.update(insertIntoAttemptBaseQuery, args, argTypes); - } catch (Exception sqle) { - logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage()); - } - }//end for-loop - - if ( payloadErrorMsg != null ) - logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables, although " + payloadErrorMsg + " Going to merge the parquet files for those tables."); - else - logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables. Going to merge the parquet files for those tables."); + int failedQueries = failedCount.get(); + String failedQueriesMsg = failedQueries + " out of " + urlReports.size() + " failed to be processed!"; + logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables" + ((failedQueries > 0) ? 
(", although " + failedQueriesMsg) : ".")
+ + " Going to merge the parquet files for those tables.");
String mergeErrorMsg = fileUtils.mergeParquetFiles("payload", "", null);
if ( mergeErrorMsg != null ) {
@@ -333,8 +365,9 @@ public class UrlController {
}
ImpalaConnector.databaseLock.unlock();
+ logger.debug("Finished merging the database tables.");
- return ResponseEntity.status(HttpStatus.OK).body(payloadErrorMsg);
+ return ResponseEntity.status(HttpStatus.OK).body(failedQueriesMsg);
}
diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
index 3936dea..476bf88 100644
--- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
@@ -362,9 +362,7 @@ public class FileUtils {
if ( j < (numFullTextsCurBatch -1) )
sb.append(",");
}
- String requestUrl = sb.toString();
-
- return requestUrl;
+ return sb.toString();
}
private final int bufferSize = 20971520; // 20 MB
From b20611414494df44997ae7320a27314150ff4c87 Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Fri, 4 Feb 2022 15:49:56 +0200
Subject: [PATCH 11/11] - Allow the user to build, push and run the App in Docker, straight through the "installAndRun.sh" script.
- Re-add the logback-spring configuration.
- Change the docker-app name.
--- Dockerfile | 4 ++-- installAndRun.sh | 34 +++++++++++++++++++++++---- src/main/resources/logback-spring.xml | 33 ++++++++++++++++++++++++++ 3 files changed, 64 insertions(+), 7 deletions(-) create mode 100644 src/main/resources/logback-spring.xml diff --git a/Dockerfile b/Dockerfile index e567ddd..fc501d6 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM openjdk:8-jdk-alpine -COPY build/libs/*-SNAPSHOT.jar app.jar +COPY build/libs/*-SNAPSHOT.jar urls_controller.jar EXPOSE 1880 -ENTRYPOINT ["java","-jar","/app.jar", "--spring.config.location=file:///mnt/config/application.properties"] +ENTRYPOINT ["java","-jar","/urls_controller.jar", "--spring.config.location=file:///mnt/config/application.properties"] diff --git a/installAndRun.sh b/installAndRun.sh index 1970b42..28201e1 100755 --- a/installAndRun.sh +++ b/installAndRun.sh @@ -1,11 +1,20 @@ cd "${0%/*}" || (echo "Could not chdir to this script's dir!" && exit) # Change the working directory to the script's directory, when running from other location. justInstall=0 +shouldRunInDocker=0 if [[ $# -eq 1 ]]; then justInstall=$1 -elif [[ $# -gt 1 ]]; then - echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: script.sh "; exit 1 +elif [[ $# -eq 2 ]]; then + justInstall=$1 + shouldRunInDocker=$2 +elif [[ $# -gt 2 ]]; then + echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: script.sh "; exit 1 +fi + +if [[ justInstall -eq 1 && shouldRunInDocker -eq 1 ]]; then + echo -e "Cannot run in docker without re-building the project (just to be safe). 
Setting \"justInstall\" to < 0 >" + justInstall=0 fi gradleVersion="7.3.3" @@ -27,10 +36,25 @@ if [[ justInstall -eq 0 ]]; then #gradle tasks # For debugging installation #gradle -v # For debugging installation - gradle clean - gradle build + gradle clean build + + if [[ shouldRunInDocker -eq 1 ]]; then + echo "Give the username for the Docker Hub:" + read -r username + echo -e "\nBuilding docker image..\n" + sudo docker --version || (echo -e "Docker was not found!"; exit 9) + dockerImage=${username}"/urls_controller:latest" + sudo docker build -t "${dockerImage}" . + echo -e "\nPushing docker image.. (the account password is required)..\n" + (sudo docker login -u "${username}" && sudo docker push "${dockerImage}") || true + (sudo mkdir -p "$HOME"/tmp/config && sudo cp ./src/main/resources/application.properties "$HOME"/tmp/config) || true + sudo docker run -d --mount type=bind,source="$HOME"/tmp/config,target=/mnt/config -p 1880:1880 "${dockerImage}" && echo "The docker container started running." + # Run in "detached mode" (in the background). + fi else export PATH=/opt/gradle/gradle-${gradleVersion}/bin:$PATH # Make sure the gradle is still accessible (it usually isn't without the "export"). fi -gradle bootRun +if [[ shouldRunInDocker -ne 1 ]]; then + gradle bootRun +fi diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml new file mode 100644 index 0000000..86ba426 --- /dev/null +++ b/src/main/resources/logback-spring.xml @@ -0,0 +1,33 @@ + + + + logs/UrlsController.log + + + logs/UrlsController.%i.log.zip + 1 + 20 + + + + 50MB + + + + UTF-8 + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n + + + + + + UTF-8 + %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %highlight(%-5level) %cyan(%logger{36}.%M\(@%line\)) - %msg%n + + + + + + + + \ No newline at end of file