diff --git a/Dockerfile b/Dockerfile index 9973be8..7b57b1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,7 +2,7 @@ FROM cassandra:4.1.3 # Install gettext to use envsubst -RUN apt-get update && apt-get install -y iputils-ping less locate gettext-base +# RUN apt-get update && apt-get install -y iputils-ping less locate gettext-base # Environment variables to configure Cassandra ENV CASSANDRA_CLUSTER_NAME=TestCluster @@ -20,19 +20,19 @@ ENV CASSANDRA_RACK=RAC1 # Copy cassandra.yaml and cassandra-rackdc.properties COPY cassandra.yaml /etc/cassandra/ -COPY cassandra-rackdc.properties /etc/cassandra/ +# COPY cassandra-rackdc.properties /etc/cassandra/ # Substitute environment variables in cassandra.yaml -RUN envsubst < /etc/cassandra/cassandra.yaml > /etc/cassandra/cassandra.yaml.tmp && mv /etc/cassandra/cassandra.yaml.tmp /etc/cassandra/cassandra.yaml +# RUN envsubst < /etc/cassandra/cassandra.yaml > /etc/cassandra/cassandra.yaml.tmp && mv /etc/cassandra/cassandra.yaml.tmp /etc/cassandra/cassandra.yaml # Substitute environment variables in cassandra-rackdc.properties -RUN envsubst < /etc/cassandra/cassandra-rackdc.properties > /etc/cassandra/cassandra-rackdc.properties.tmp && mv /etc/cassandra/cassandra-rackdc.properties.tmp /etc/cassandra/cassandra-rackdc.properties +# RUN envsubst < /etc/cassandra/cassandra-rackdc.properties > /etc/cassandra/cassandra-rackdc.properties.tmp && mv /etc/cassandra/cassandra-rackdc.properties.tmp /etc/cassandra/cassandra-rackdc.properties # COPY scripts/setup.sh /setup.sh # VOLUME "/scripts/setup.sh" # Set the entrypoint -ENTRYPOINT ["/scripts/setup.sh"] +# ENTRYPOINT ["/scripts/setup.sh"] # Expose Cassandra ports EXPOSE 7000 7001 7199 9042 9160 diff --git a/README.md b/README.md index 0612be7..66802f6 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,10 @@ in dev environment, cassandra has been installed manually and nodetool is not in * dev should be aliegned to prod 
+https://medium.com/@kayvan.sol2/deploying-apache-cassandra-cluster-3-nodes-with-docker-compose-3634ef8345e8 -# DOCKER + +## DOCKER instructions aggiungere a /etc/host l'entry: @@ -23,13 +25,26 @@ force recreate docker image `docker compose build --no-cache` check status: -`docker exec -it cassandra-cassandra-1 nodetool status` +`docker exec -it cassandra-1 nodetool status` +## import db +`./dump.sh` + +`docker compose up --build` + +attendere che tutto sia pronto e i db siano sincronizzati + +TODO: definire esattamente come, tendenzialmente con nodetool status , nodetool gossip, etc + +eseguire uno dopo l'altro, quando sono terminati +* cassandra1: `docker exec -it cassandra1 /scripts/setup.sh` +* cassandra2: `docker exec -it cassandra2 /scripts/setup.sh` +* cassandra3: `docker exec -it cassandra3 /scripts/setup.sh` run a single service: -* cassandra1: `docker-compose up cassandra-cassandra1 --build` -* cassandra2: `docker-compose up cassandra-cassandra2 --build` +* cassandra1: `docker-compose up cassandra1 --build` +* cassandra2: `docker-compose up cassandra2 --build` * ,... 
open bash on server @@ -38,4 +53,27 @@ open bash on server * cassandra2: `docker exec -it cassandra2 /bin/bash` * cassandra3: `docker exec -it cassandra3 /bin/bash` +## check cassandra status +Check status +`nodetool status` + + + +Check if the Gossip protocol is enabled +`nodetool info | grep -i gossip` + +Check the status of the Gossip protocol +`nodetool gossipinfo` + +Check the communication between nodes +`nodetool ring` + + + + + +## Documentation + +* [cassandra dump data](docs/dump.md) +* [cassandra setup and import](docs/setup.md) diff --git a/cassandra.yaml b/cassandra.yaml index c885848..1f6fb62 100644 --- a/cassandra.yaml +++ b/cassandra.yaml @@ -1,11 +1,11 @@ -# Cluster name -cluster_name: ${CASSANDRA_CLUSTER_NAME} +# # Cluster name +# cluster_name: ${CASSANDRA_CLUSTER_NAME} -# Addresses -listen_address: ${CASSANDRA_LISTEN_ADDRESS} -broadcast_address: ${CASSANDRA_BROADCAST_ADDRESS} +# # Addresses +# listen_address: ${CASSANDRA_LISTEN_ADDRESS} +# broadcast_address: ${CASSANDRA_BROADCAST_ADDRESS} # rpc_address: 0.0.0.0 -broadcast_rpc_address: ${CASSANDRA_RPC_ADDRESS} +# broadcast_rpc_address: ${CASSANDRA_RPC_ADDRESS} # Seed nodes seed_provider: @@ -21,12 +21,12 @@ commitlog_directory: /var/lib/cassandra/commitlog saved_caches_directory: /var/lib/cassandra/saved_caches -client_encryption_options: - enabled: false - optional: false +# client_encryption_options: +# enabled: false +# optional: false # Tokens and allocation -num_tokens: ${CASSANDRA_NUM_TOKENS} +# num_tokens: ${CASSANDRA_NUM_TOKENS} allocate_tokens_for_local_replication_factor: 3 # Hinted handoff settings @@ -122,7 +122,8 @@ request_timeout: 1000000ms slow_query_log_timeout: 500ms # Snitch settings -endpoint_snitch: GossipingPropertyFileSnitch +# endpoint_snitch: GossipingPropertyFileSnitch +endpoint_snitch: SimpleSnitch dynamic_snitch_update_interval: 100ms dynamic_snitch_reset_interval: 600000ms dynamic_snitch_badness_threshold: 1.0 diff --git a/docker-compose.yml b/docker-compose.yml index 
462405c..a609370 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,5 +1,3 @@ -version: '3.8' - services: cassandra1: build: . @@ -18,11 +16,20 @@ services: - ./data/dumps/node1:/dump/snapshot - ./data/dumps/schema:/dump/schema - ./data/volumes/node1:/var/lib/cassandra - - ./logs/node1:/var/log/cassandra + - ./data/logs/node1:/var/log/cassandra + healthcheck: + test: ["CMD-SHELL", "nodetool status"] + interval: 2m + start_period: 2m + timeout: 10s + retries: 3 ports: - "9042:9042" networks: - cassandra-net + restart: + on-failure + cassandra2: build: . @@ -36,16 +43,25 @@ services: - CASSANDRA_DC=DC1 - CASSANDRA_RACK=RAC1 - PRIMARY_NODE=false + healthcheck: + test: ["CMD-SHELL", "nodetool status"] + interval: 2m + start_period: 2m + timeout: 10s + retries: 3 volumes: - ./scripts:/scripts - ./data/dumps/node2:/dump/snapshot - ./data/dumps/schema:/dump/schema - ./data/volumes/node2:/var/lib/cassandra - - ./logs/node2:/var/log/cassandra + - ./data/logs/node2:/var/log/cassandra networks: - cassandra-net depends_on: - - cassandra1 + cassandra1: + condition: service_healthy + restart: + on-failure cassandra3: build: . 
@@ -59,16 +75,27 @@ services: - CASSANDRA_DC=DC1 - CASSANDRA_RACK=RAC1 - PRIMARY_NODE=false + healthcheck: + test: ["CMD-SHELL", "nodetool status"] + interval: 2m + start_period: 2m + timeout: 10s + retries: 3 volumes: - ./scripts:/scripts - ./data/dumps/node3:/dump/snapshot - ./data/dumps/schema:/dump/schema - ./data/volumes/node3:/var/lib/cassandra - - ./logs/node3:/var/log/cassandra + - ./data/logs/node3:/var/log/cassandra networks: - cassandra-net depends_on: - - cassandra2 + cassandra2: + condition: service_healthy + restart: + on-failure + + networks: cassandra-net: diff --git a/docs/dump.md b/docs/dump.md new file mode 100644 index 0000000..1d403ce --- /dev/null +++ b/docs/dump.md @@ -0,0 +1,121 @@ +# Documentation: Exporting Data from Existing Cassandra Cluster + +This process exports data from an existing Cassandra cluster by creating snapshots on each node and copying the data to a local directory. + +The steps ensure a consistent and reliable backup of the keyspace data. + +The snapshot creation and data synchronization steps are executed in parallel for all nodes to speed up the process and ensure consistency. + +## Dump Process + +The data dump process involves taking a snapshot of the keyspace from each Cassandra node, copying the snapshots locally, and exporting the keyspace schema. This process is performed in parallel for efficiency. + +1. **Clear Old Snapshots:** + - For each node, remove any existing snapshots with the specified tag to ensure a clean state. + +2. **Create New Snapshots:** + - For each node, create a new snapshot with the specified tag. + +3. **Synchronize Snapshots Locally:** + - Copy the snapshot data from each node to the local directory. Each table's data is copied into a directory named after the table. + +4. **Export Keyspace Schema:** + - Export the keyspace schema from the first node and save it locally. + +### Directory Structure on Server +- Each table in the keyspace has its own directory. 
+- Inside each table's directory, there is a `snapshots` directory. +- The `snapshots` directory contains subdirectories for each snapshot, named according to the snapshot tag. + +### Local Directory Structure +- The local directory mirrors the server's structure. +- Each table's snapshot data is stored in a directory named after the table, inside the local dump directory. + +By following this process, a consistent and reliable backup of the Cassandra keyspace data is achieved, ensuring that the data can be restored or migrated as needed. + +## Directory Structure Example + +### Server-Side Structure + +On the server, the directory structure for the snapshots is organized as follows: + +```plaintext +/data +└── dev_keyspace_1 + ├── table1-abc1234567890abcdef1234567890abcdef + │ └── snapshots + │ └── dump_docker + │ ├── manifest.json + │ ├── nb-1-big-CompressionInfo.db + │ ├── nb-1-big-Data.db + │ ├── nb-1-big-Digest.crc32 + │ ├── nb-1-big-Filter.db + │ ├── nb-1-big-Index.db + │ ├── nb-1-big-Statistics.db + │ ├── nb-1-big-Summary.db + │ └── schema.cql + ├── table2-def4567890abcdef1234567890abcdef + │ └── snapshots + │ └── dump_docker + │ ├── manifest.json + │ ├── nb-1-big-CompressionInfo.db + │ ├── nb-1-big-Data.db + │ ├── nb-1-big-Digest.crc32 + │ ├── nb-1-big-Filter.db + │ ├── nb-1-big-Index.db + │ ├── nb-1-big-Statistics.db + │ ├── nb-1-big-Summary.db + │ └── schema.cql + └── table3-ghi7890abcdef1234567890abcdef + └── snapshots + └── dump_docker + ├── manifest.json + ├── nb-1-big-CompressionInfo.db + ├── nb-1-big-Data.db + ├── nb-1-big-Digest.crc32 + ├── nb-1-big-Filter.db + ├── nb-1-big-Index.db + ├── nb-1-big-Statistics.db + ├── nb-1-big-Summary.db + └── schema.cql +``` + +#### Local Directory Structure +When copied locally, the directory structure is organized as follows: + +```plaintext +data/dumps +├──schema +│ ├── dev_keyspace_1_schema.cql +└── node1 + ├── table1 + │ ├── manifest.json + │ ├── nb-1-big-CompressionInfo.db + │ ├── nb-1-big-Data.db + │ ├── 
nb-1-big-Digest.crc32 + │ ├── nb-1-big-Filter.db + │ ├── nb-1-big-Index.db + │ ├── nb-1-big-Statistics.db + │ ├── nb-1-big-Summary.db + │ └── schema.cql + ├── table2 + │ ├── manifest.json + │ ├── nb-1-big-CompressionInfo.db + │ ├── nb-1-big-Data.db + │ ├── nb-1-big-Digest.crc32 + │ ├── nb-1-big-Filter.db + │ ├── nb-1-big-Index.db + │ ├── nb-1-big-Statistics.db + │ ├── nb-1-big-Summary.db + │ └── schema.cql + └── table3 + ├── manifest.json + ├── nb-1-big-CompressionInfo.db + ├── nb-1-big-Data.db + ├── nb-1-big-Digest.crc32 + ├── nb-1-big-Filter.db + ├── nb-1-big-Index.db + ├── nb-1-big-Statistics.db + ├── nb-1-big-Summary.db + └── schema.cql +``` \ No newline at end of file diff --git a/docs/setup.md b/docs/setup.md new file mode 100644 index 0000000..f0473fc --- /dev/null +++ b/docs/setup.md @@ -0,0 +1,35 @@ + +### Cassandra Cluster Setup and Data Migration Workflow + +Workflow for setting up a Cassandra cluster with multiple nodes, creating keyspaces and schemas, and exporting and reimporting data. The process ensures synchronization across nodes and efficient data migration using snapshots. + +#### Workflow Phases +The workflow is divided into the following phases: +1. **Startup Phase**: All nodes start Cassandra and ensure they are ready to accept connections. +2. **Schema Creation Phase**: The primary node creates the keyspace and schema if they do not exist. This schema is then propagated to other nodes. +3. **Data Import Phase**: Data is imported from snapshots using `sstableloader` only if the schema was newly created. + +#### Phase 1: Startup Phase +Each node starts Cassandra and waits for it to be ready before proceeding to the next phase. + +- **Primary Node**: Starts Cassandra and waits for other nodes to signal they are ready. +- **Non-Primary Nodes**: Wait for the primary node to be ready before starting Cassandra. 
+ +#### Phase 2: Schema Creation Phase +After all nodes are confirmed to be ready, the primary node checks if the keyspace exists and creates it if it does not. + +- **Primary Node**: + - Checks if the keyspace exists. + - If the keyspace does not exist, creates the keyspace and applies the schema. + - Waits for the schema to propagate to all nodes. +- **Non-Primary Nodes**: + - Wait for the primary node to complete schema creation and propagation. + +#### Phase 3: Data Import Phase +Data is imported into the keyspace using `sstableloader` from the snapshots if the schema was newly created. + +- **Primary Node**: + - If the schema was created, imports data from the snapshots. +- **Non-Primary Nodes**: + - Wait for the primary node to complete the data import. + diff --git a/scripts/import.sh b/scripts/import.sh new file mode 100755 index 0000000..cbf9b0d --- /dev/null +++ b/scripts/import.sh @@ -0,0 +1,19 @@ +#!/bin/bash + + +SNAPSHOT_DIR="$DUMP_DIR/snapshot" +IP_ADDRESS=$(hostname -I | awk '{print $1}') + +# Define a logging function +log() { + local MESSAGE="$1" + echo -e "$MESSAGE" | tee -a /var/log/cassandra/import.log +} + +log "Importing snapshots using sstableloader..." +for TABLE_DIR in $(ls $SNAPSHOT_DIR); do + TABLE_NAME=$(basename $TABLE_DIR) # Extract table name from directory name + log "Importing table: $TABLE_NAME from directory: $SNAPSHOT_DIR/$TABLE_DIR" + sstableloader -d "$CASSANDRA_SEEDS" -v -k "$KEYSPACE" "$SNAPSHOT_DIR/$TABLE_DIR" + cqlsh $IP_ADDRESS -k "$KEYSPACE" -e "select count(*) from $TABLE_NAME;" >&2 +done diff --git a/scripts/is_cassandra_ready.sh b/scripts/is_cassandra_ready.sh new file mode 100755 index 0000000..9f5a4cb --- /dev/null +++ b/scripts/is_cassandra_ready.sh @@ -0,0 +1,23 @@ +#!/bin/bash +IP_ADDRESS=$(hostname -I | awk '{print $1}') + +# Define a logging function +log() { + local MESSAGE="$1" + echo -e "$MESSAGE" | tee -a /var/log/cassandra/is_cassandra_ready.log +} + +log "Checking if Cassandra is ready..." 
+ +is_cassandra_ready() { + cqlsh $IP_ADDRESS -e 'SHOW HOST' > /dev/null 2>&1 +} + +is_cassandra_ready +if [ $? -eq 0 ]; then + log "Cassandra is ready." + exit 0 +else + log "Cassandra is not ready." + exit 1 +fi diff --git a/scripts/is_keyspace_exists.sh b/scripts/is_keyspace_exists.sh new file mode 100755 index 0000000..6e43516 --- /dev/null +++ b/scripts/is_keyspace_exists.sh @@ -0,0 +1,35 @@ +#!/bin/bash + +# Usage: is_keyspace_exists.sh [--keyspace ] +# Example: is_keyspace_exists.sh --keyspace dev_keyspace_1 + +KEYSPACE=${KEYSPACE:-} + +# Parse arguments +while [ $# -gt 0 ]; do + case "$1" in + --keyspace) + KEYSPACE="$2" + shift 2 + ;; + *) + echo "Unknown argument: $1" + exit 1 + ;; + esac +done + +# Check for required arguments or environment variables +if [ -z "$KEYSPACE" ]; then + echo "KEYSPACE is not set. Set it via --keyspace or KEYSPACE environment variable." + exit 1 +fi + +IP_ADDRESS=$(hostname -I | awk '{print $1}') + +if cqlsh $IP_ADDRESS -e "DESCRIBE KEYSPACE $KEYSPACE;" > /dev/null 2>&1; then + echo "Keyspace $KEYSPACE EXISTS" + exit 0 +fi +echo "Keyspace $KEYSPACE DOES NOT EXIST" +exit 1 diff --git a/scripts/is_node_up.sh b/scripts/is_node_up.sh new file mode 100755 index 0000000..024d250 --- /dev/null +++ b/scripts/is_node_up.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# Define a logging function +log() { + local MESSAGE="$1" + echo -e "$MESSAGE" | tee -a /var/log/cassandra/is_node_up.log +} + +# Default values +NODE=${1:-$(hostname -I | awk '{print $1}')} +CASSANDRA_RPC_ADDRESS=${2:-$CASSANDRA_RPC_ADDRESS} + +log "Checking if node $NODE is up..." + +is_node_up() { + local NODE="$1" + local NODE_STATUS=$(nodetool status -r) + if echo "$NODE_STATUS" | grep -E "^UN" | grep "$NODE" > /dev/null; then + return 0 + elif [ "$NODE" = "$CASSANDRA_RPC_ADDRESS" ]; then + NODE_STATUS=$(nodetool status) + if echo "$NODE_STATUS" | grep -E "^UN.*$(hostname -I | awk '{print $1}')" > /dev/null; then + return 0 + fi + fi + return 1 +} + +is_node_up $NODE +if [ $? 
-eq 0 ]; then + log "Node $NODE is up." + exit 0 +else + log "Node $NODE is not up." + exit 1 +fi diff --git a/scripts/is_primary_node.sh b/scripts/is_primary_node.sh new file mode 100755 index 0000000..ae8e215 --- /dev/null +++ b/scripts/is_primary_node.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# PRIMARY_NODE=${PRIMARY_NODE:-} +echo PRIMARY_NODE: $PRIMARY_NODE + +if [ "$PRIMARY_NODE" = "true" ]; then + exit 0 + +else + exit 1 +fi diff --git a/scripts/is_schema_agreed.sh b/scripts/is_schema_agreed.sh new file mode 100755 index 0000000..0e4bf0a --- /dev/null +++ b/scripts/is_schema_agreed.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +IP_ADDRESS=$(hostname -I | awk '{print $1}') +SEEDS=(${CASSANDRA_SEEDS//,/ }) + + +# Define a logging function +log() { + local MESSAGE="$1" + echo -e "$MESSAGE" | tee -a /var/log/cassandra/is_schema_agreed.log +} + +log "Checking if schema is agreed..." + +is_schema_agreed() { + if cqlsh $IP_ADDRESS -e "DESCRIBE KEYSPACE $KEYSPACE;" > /dev/null 2>&1; then + SCHEMA_NODES=$(nodetool describecluster | grep -A 1 "Schema versions:" | grep -o '\[.*\]' | tr -d '[]' | tr ',' '\n' | wc -l) + if [ "$SCHEMA_NODES" -eq "${#SEEDS[@]}" ]; then + return 0 + fi + fi + return 1 +} + +is_schema_agreed +if [ $? -eq 0 ]; then + log "Schema is agreed." + exit 0 +else + log "Schema is not agreed." 
+ exit 1 +fi diff --git a/scripts/setup.sh b/scripts/setup.sh index 8d1185d..d0a54eb 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -8,87 +8,111 @@ log() { log "RUNNING SETUP" +# Configuration KEYSPACE="dev_keyspace_1" DUMP_DIR="/dump" # Ensure DUMP_DIR is defined -SCHEMA_PATH="$DUMP_DIR/schema/${KEYSPACE}_schema.cql" # Ensure DUMP_DIR is defined +SCHEMA_PATH="$DUMP_DIR/schema/${KEYSPACE}_schema.cql" CASSANDRA_SEEDS="cassandra1,cassandra2,cassandra3" -STATUS_DIR="/var/log/cassandra" IP_ADDRESS=$(hostname -I | awk '{print $1}') +DATA_DIR="/var/lib/cassandra/data/$KEYSPACE" +SNAPSHOT_DIR="$DUMP_DIR/snapshot" +PRIMARY_NODE=${PRIMARY_NODE:-false} # Default to false if not set +SLEEP_DURATION=5 # Sleep duration in seconds for waits +TIMEOUT=3000 # Timeout in seconds for waits # Initialize SEEDS array SEEDS=(${CASSANDRA_SEEDS//,/ }) -# Function to wait for all nodes to be in the 'UN' state -wait_for_all_nodes_up() { - SEEDS=(${CASSANDRA_SEEDS//,/ }) +# Function to wait for a command to succeed +wait_for_command() { + local COMMAND="$1" + local TIMEOUT="$2" + local START_TIME=$(date +%s) + local END_TIME=$((START_TIME + TIMEOUT)) + while true; do - all_up=true - for seed in "${SEEDS[@]}"; do - NODE_STATUS=$(nodetool status -r) - if ! echo "$NODE_STATUS" | grep -E "^UN.*$seed" > /dev/null; then - if [ "$seed" = "$CASSANDRA_RPC_ADDRESS" ]; then - NODE_STATUS=$(nodetool status) - if ! echo "$NODE_STATUS" | grep -E "^UN.*$(hostname -I | awk '{print $1}')" > /dev/null; then - log "Node $seed (self) is not up yet..." - all_up=false - break - fi - else - log "Node $seed is not up yet..." - all_up=false - break - fi - fi - done - if [ "$all_up" = true ]; then - log "All nodes are up." 
+ if eval "$COMMAND"; then + log "Command succeeded: $COMMAND" break else - sleep 5 + local CURRENT_TIME=$(date +%s) + if [ "$CURRENT_TIME" -ge "$END_TIME" ]; then + log "Timed out waiting for command: $COMMAND" + exit 1 + fi + + log "Command failed: $COMMAND, still waiting" + sleep $SLEEP_DURATION fi done } -# Function to wait for schema agreement across all nodes -wait_for_schema_agreement() { - while true; do - if cqlsh $IP_ADDRESS -e "DESCRIBE KEYSPACE $KEYSPACE;" > /dev/null 2>&1; then - if nodetool describecluster | grep -q "Schema versions:"; then - SCHEMA_COUNT=$(nodetool describecluster | grep -A 1 "Schema versions:" | wc -l) - if [ "$SCHEMA_COUNT" -eq 2 ]; then - log "Schema agreement reached." - break - else - log "Waiting for schema agreement..." - fi - fi - else - log "Waiting for keyspace $KEYSPACE to be available..." +# Function to check if a node is up +is_node_up() { + local NODE="$1" + local NODE_STATUS=$(nodetool status -r) + if echo "$NODE_STATUS" | grep -E "^UN" | grep "$NODE" > /dev/null; then + return 0 + elif [ "$NODE" = "$CASSANDRA_RPC_ADDRESS" ]; then + NODE_STATUS=$(nodetool status) + if echo "$NODE_STATUS" | grep -E "^UN.*$(hostname -I | awk '{print $1}')" > /dev/null; then + return 0 fi - sleep 5 + fi + return 1 +} + +# Function to wait for all nodes to be up +wait_for_all_nodes_up() { + for seed in "${SEEDS[@]}"; do + wait_for_command "is_node_up $seed" $TIMEOUT done + log "All nodes are up." 
+} + +# Function to check for schema agreement and if schema exists +is_schema_agreed() { + if cqlsh $IP_ADDRESS -e "DESCRIBE KEYSPACE $KEYSPACE;" > /dev/null 2>&1; then + SCHEMA_NODES=$(nodetool describecluster | grep -A 1 "Schema versions:" | grep -o '\[.*\]' | tr -d '[]' | tr ',' '\n' | wc -l) + if [ "$SCHEMA_NODES" -eq "${#SEEDS[@]}" ]; then + return 0 + fi + fi + return 1 +} + +# Function to check if keyspace exists +is_keyspace_exists() { + if cqlsh $IP_ADDRESS -e "DESCRIBE KEYSPACE $KEYSPACE;" > /dev/null 2>&1; then + return 0 + fi + return 1 } log "setup KEYSPACE: $KEYSPACE" log "setup DUMP_DIR: $DUMP_DIR" log "setup SCHEMA_PATH: $SCHEMA_PATH" log "setup CASSANDRA_SEEDS: $CASSANDRA_SEEDS" -log "setup STATUS_DIR: $STATUS_DIR" + +# Check if the keyspace directory exists and is not empty +if [ -d "$DATA_DIR" ] && [ "$(ls -A $DATA_DIR)" ]; then + log "Data directory $DATA_DIR exists and is not empty. Skipping schema creation and data import." + SCHEMA_CREATED=false +else + log "Data directory $DATA_DIR does not exist or is empty. Proceeding with schema creation and data import." + SCHEMA_CREATED=true +fi # Wait for cassandra1 to be ready if this is not the primary node if [ "$PRIMARY_NODE" != "true" ]; then - log "Waiting for cassandra1 to be ready..." - /wait-for-it.sh cassandra1:9042 -t 60 -- log "cassandra1 is ready" + wait_for_service cassandra1 9042 $TIMEOUT fi # Start Cassandra in the background -cassandra -R & +# cassandra -R & # Wait for Cassandra to be ready -log "Waiting for Cassandra to start..." -until cqlsh $IP_ADDRESS -e "SHOW HOST" > /dev/null 2>&1; do - sleep 2 -done +wait_for_command "cqlsh $IP_ADDRESS -e 'SHOW HOST' > /dev/null 2>&1" $TIMEOUT # Log the value of PRIMARY_NODE for debugging log "PRIMARY_NODE is set to: $PRIMARY_NODE" @@ -100,42 +124,30 @@ wait_for_all_nodes_up # Step 2: Create keyspace and schema on the primary node if [ "$PRIMARY_NODE" = "true" ]; then log "Checking if keyspace $KEYSPACE exists..." - if ! 
cqlsh $IP_ADDRESS -e "DESCRIBE KEYSPACE $KEYSPACE;" > /dev/null 2>&1; then + if ! is_keyspace_exists; then log "Keyspace $KEYSPACE does not exist. Creating keyspace and tables..." cqlsh $IP_ADDRESS -f "$SCHEMA_PATH" else log "Keyspace $KEYSPACE already exists. Ensuring tables exist..." fi - - # Signal to secondary nodes that schema creation is complete - touch $STATUS_DIR/schema_created fi # Step 3: Wait for schema to be created and agreed upon across all nodes log "Waiting for schema agreement across all nodes..." -wait_for_schema_agreement +wait_for_command "is_schema_agreed" $TIMEOUT # Step 4: Import data using sstableloader if not previously imported -if [ "$PRIMARY_NODE" = "true" ]; then +# if [ "$SCHEMA_CREATED" = true ]; then log "Importing snapshots using sstableloader..." - for TABLE_DIR in $(ls $DUMP_DIR); do + for TABLE_DIR in $(ls $SNAPSHOT_DIR); do TABLE_NAME=$(basename $TABLE_DIR) # Extract table name from directory name - log "Importing table: $TABLE_NAME from directory: $DUMP_DIR/$TABLE_DIR" - sstableloader -d "$CASSANDRA_SEEDS" -v -k "$KEYSPACE" "$DUMP_DIR/$TABLE_DIR" + log "Importing table: $TABLE_NAME from directory: $SNAPSHOT_DIR/$TABLE_DIR" + sstableloader -d "$CASSANDRA_SEEDS" -v -k "$KEYSPACE" "$SNAPSHOT_DIR/$TABLE_DIR" cqlsh $IP_ADDRESS -k "$KEYSPACE" -e "select count(*) from $TABLE_NAME;" >&2 done - - # Signal to secondary nodes that import is complete - touch $STATUS_DIR/import_complete -else - # Wait for import completion signal from primary node - log "Waiting for import completion signal from primary node..." - while [ ! 
-f "$STATUS_DIR/import_complete" ]; do - sleep 5 - done -fi +# fi log "FINISHED IMPORT" # Keep the container running -tail -f /dev/null +# tail -f /dev/null diff --git a/scripts/setup_empty.sh b/scripts/setup_empty.sh new file mode 100755 index 0000000..53fa45a --- /dev/null +++ b/scripts/setup_empty.sh @@ -0,0 +1,5 @@ +#!/bin/bash + + +# Keep the container running +tail -f /dev/null diff --git a/scripts/setup_orig.sh b/scripts/setup_orig.sh new file mode 100755 index 0000000..3283ec3 --- /dev/null +++ b/scripts/setup_orig.sh @@ -0,0 +1,120 @@ +#!/bin/bash + +# Define a logging function +log() { + local MESSAGE="$1" + echo -e "$MESSAGE" | tee -a /var/log/cassandra/setup.log +} + +log "RUNNING SETUP" + +# Configuration +KEYSPACE=${KEYSPACE:-dev_keyspace_1} +DUMP_DIR=${DUMP_DIR:-/dump} # Ensure DUMP_DIR is defined +CASSANDRA_SEEDS=${CASSANDRA_SEEDS:-cassandra1,cassandra2,cassandra3} +PRIMARY_NODE=${PRIMARY_NODE:-false} # Default to false if not set + +IP_ADDRESS=$(hostname -I | awk '{print $1}') + +SCHEMA_PATH="$DUMP_DIR/schema/${KEYSPACE}_schema.cql" +DATA_DIR="/var/lib/cassandra/data/$KEYSPACE" +SNAPSHOT_DIR="$DUMP_DIR/snapshot" + +# Initialize SEEDS array +SEEDS=(${CASSANDRA_SEEDS//,/ }) + +SLEEP_DURATION=5 # Sleep duration in seconds for waits +TIMEOUT=300 # Timeout in seconds for waits + +# Function to wait for a command to succeed +wait_for_command() { + local COMMAND="$1" + local TIMEOUT="$2" + local START_TIME=$(date +%s) + local END_TIME=$((START_TIME + TIMEOUT)) + + while true; do + if eval "$COMMAND"; then + log "Command succeeded: $COMMAND" + break + else + local CURRENT_TIME=$(date +%s) + if [ "$CURRENT_TIME" -ge "$END_TIME" ]; then + log "Timed out waiting for command: $COMMAND" + exit 1 + fi + + log "Command failed: $COMMAND, still waiting" + sleep $SLEEP_DURATION + fi + done +} + +log "setup KEYSPACE: $KEYSPACE" +log "setup DUMP_DIR: $DUMP_DIR" +log "setup SCHEMA_PATH: $SCHEMA_PATH" +log "setup CASSANDRA_SEEDS: $CASSANDRA_SEEDS" + +# Check if the 
keyspace directory exists and is not empty +if [ -d "$DATA_DIR" ] && [ "$(ls -A $DATA_DIR)" ]; then + log "Data directory $DATA_DIR exists and is not empty. Skipping schema creation and data import." + EMPTY_DB=false +else + log "Data directory $DATA_DIR does not exist or is empty. Proceeding with schema creation and data import." + EMPTY_DB=true +fi + +# # Wait for cassandra1 to be ready if this is not the primary node +# if [ "$PRIMARY_NODE" != "true" ]; then +# wait_for_command "/scripts/is_node_up.sh --node cassandra1 --cassandra_rpc_address $IP_ADDRESS" $TIMEOUT +# fi + +# Start Cassandra in the background +cassandra -R & + +# Wait for Cassandra to be ready +wait_for_command "/scripts/is_cassandra_ready.sh" $TIMEOUT + +# Log the value of PRIMARY_NODE for debugging +log "PRIMARY_NODE is set to: $PRIMARY_NODE" + +# Step 1: Wait for all nodes to be up and ready +log "Waiting for all nodes to be up and ready..." +wait_for_command "/scripts/is_node_up.sh --node $seed " $TIMEOUT +// TODO: aspettare tutti i nodi + +# Function to wait for all nodes to be up +wait_for_all_nodes_up() { + for seed in "${SEEDS[@]}"; do + wait_for_command "/scripts/is_node_up.sh --node $seed " $TIMEOUT + done + log "All nodes are up." +} + +wait_for_all_nodes_up + +# Step 2: Create keyspace and schema on the primary node +if [ "$PRIMARY_NODE" = "true" ]; then + log "Checking if keyspace $KEYSPACE exists..." + if ! /scripts/is_keyspace_exists.sh --keyspace "$KEYSPACE"; then + log "Keyspace $KEYSPACE does not exist. Creating keyspace and tables..." + cqlsh $IP_ADDRESS -f "$SCHEMA_PATH" + else + log "Keyspace $KEYSPACE already exists. Ensuring tables exist..." + fi +fi + +# Step 3: Wait for schema to be created and agreed upon across all nodes +log "Waiting for schema agreement across all nodes..." 
+wait_for_command "/scripts/is_schema_agreed.sh --keyspace $KEYSPACE --cassandra_seeds $CASSANDRA_SEEDS" $TIMEOUT + +# Step 4: Import data using sstableloader if not previously imported +if [ "$EMPTY_DB" = true ]; then + log "Importing snapshots using sstableloader..." + /scripts/import.sh --keyspace "$KEYSPACE" --dump_dir "$SNAPSHOT_DIR" --cassandra_seeds "$CASSANDRA_SEEDS" +fi + +log "FINISHED IMPORT" + +# Keep the container running +tail -f /dev/null diff --git a/scripts/wait_for_command.sh b/scripts/wait_for_command.sh new file mode 100755 index 0000000..c05bc39 --- /dev/null +++ b/scripts/wait_for_command.sh @@ -0,0 +1,31 @@ +#!/bin/bash + +COMMAND="$1" +TIMEOUT="$2" +SLEEP_DURATION=5 # Sleep duration in seconds for waits + +log() { + local MESSAGE="$1" + echo -e "$MESSAGE" | tee -a /var/log/cassandra/setup.log +} + +wait_for_command() { + local START_TIME=$(date +%s) + local END_TIME=$((START_TIME + TIMEOUT)) + + while true; do + if eval "$COMMAND"; then + log "Command succeeded: $COMMAND" + break + else + local CURRENT_TIME=$(date +%s) + if [ "$CURRENT_TIME" -ge "$END_TIME" ]; then + log "Timed out waiting for command: $COMMAND" + exit 1 + fi + sleep $SLEEP_DURATION + fi + done +} + +wait_for_command