From 0d502853db4c2818b3c4e1314d141711f01f5784 Mon Sep 17 00:00:00 2001 From: Carl Vitullo Date: Wed, 28 Jan 2026 10:56:14 -0500 Subject: [PATCH 1/2] Add scripts to help identify and recover from DB corruption --- scripts/db-backup.sh | 60 +++++++++++ scripts/db-deploy.sh | 169 +++++++++++++++++++++++++++++++ scripts/db-integrity.sh | 137 ++++++++++++++++++++++++++ scripts/db-rebuild.sh | 213 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 579 insertions(+) create mode 100755 scripts/db-backup.sh create mode 100755 scripts/db-deploy.sh create mode 100755 scripts/db-integrity.sh create mode 100755 scripts/db-rebuild.sh diff --git a/scripts/db-backup.sh b/scripts/db-backup.sh new file mode 100755 index 00000000..439f5b1c --- /dev/null +++ b/scripts/db-backup.sh @@ -0,0 +1,60 @@ +#!/bin/bash +# Backup the production database from Kubernetes pod to local destination. +# Usage: ./scripts/db-backup.sh [destination] +# +# Arguments: +# destination - Optional path for backup file. Defaults to timestamped file in current directory. + +set -euo pipefail + +POD_NAME="mod-bot-set-0" +REMOTE_DB_PATH="/data/mod-bot.sqlite3" + +# Determine destination path +if [[ $# -ge 1 ]]; then + DESTINATION="$1" +else + TIMESTAMP=$(date +"%Y-%m-%d_%H%M%S") + DESTINATION="./mod-bot-backup-${TIMESTAMP}.sqlite3" +fi + +echo "Backing up database from ${POD_NAME}..." +echo " Source: ${POD_NAME}:${REMOTE_DB_PATH}" +echo " Destination: ${DESTINATION}" +echo + +# Copy the database file from the pod +if ! kubectl cp "${POD_NAME}:${REMOTE_DB_PATH}" "${DESTINATION}"; then + echo "Error: Failed to copy database from pod" >&2 + exit 1 +fi + +# Also copy WAL and SHM files if they exist (for complete backup) +echo "Checking for WAL files..." +if kubectl exec "${POD_NAME}" -- test -f "${REMOTE_DB_PATH}-wal" 2>/dev/null; then + echo " Copying WAL file..." + kubectl cp "${POD_NAME}:${REMOTE_DB_PATH}-wal" "${DESTINATION}-wal" 2>/dev/null || true +fi + +if kubectl exec "${POD_NAME}" -- test -f "${REMOTE_DB_PATH}-shm" 2>/dev/null; then + echo " Copying SHM file..." + kubectl cp "${POD_NAME}:${REMOTE_DB_PATH}-shm" "${DESTINATION}-shm" 2>/dev/null || true +fi + +# Show file size +FILE_SIZE=$(ls -lh "${DESTINATION}" | awk '{print $5}') +echo +echo "Backup complete!" +echo " File: ${DESTINATION}" +echo " Size: ${FILE_SIZE}" + +# Quick integrity check +echo +echo "Running quick integrity check..." +INTEGRITY=$(sqlite3 "${DESTINATION}" "PRAGMA quick_check;" 2>&1) +if [[ "${INTEGRITY}" == "ok" ]]; then + echo " Integrity: OK" +else + echo " Integrity: ISSUES DETECTED" + echo " Run ./scripts/db-integrity.sh ${DESTINATION} for details" +fi diff --git a/scripts/db-deploy.sh b/scripts/db-deploy.sh new file mode 100755 index 00000000..52305d90 --- /dev/null +++ b/scripts/db-deploy.sh @@ -0,0 +1,169 @@ +#!/bin/bash +# Deploy a repaired database to production using a temporary pod. +# +# Usage: ./scripts/db-deploy.sh +# +# This script will: +# 1. Verify local database integrity +# 2. Scale down the production StatefulSet +# 3. Create a temporary pod attached to the data volume +# 4. Copy the repaired database to the volume +# 5. Scale up the production StatefulSet +# 6. Delete the temporary pod + +set -euo pipefail + +STATEFULSET_NAME="mod-bot-set" +PVC_NAME="data-mod-bot-set-0" +REMOTE_DB_PATH="/data/mod-bot.sqlite3" +TEMP_POD_NAME="db-deploy-temp" +NAMESPACE="${NAMESPACE:-default}" + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " >&2 + echo "" >&2 + echo "Deploys a repaired database to production by:" >&2 + echo " 1. 
Scaling down the StatefulSet" >&2 + echo " 2. Mounting the volume in a temporary pod" >&2 + echo " 3. Copying the database" >&2 + echo " 4. Scaling up and cleaning up" >&2 + exit 1 +fi + +LOCAL_DB="$1" + +if [[ ! -f "${LOCAL_DB}" ]]; then + echo "Error: Database file not found: ${LOCAL_DB}" >&2 + exit 1 +fi + +cleanup() { + echo "" + echo "Cleaning up..." + kubectl delete pod "${TEMP_POD_NAME}" --ignore-not-found=true 2>/dev/null || true +} + +echo "Database Deployment" +echo "Source: ${LOCAL_DB}" +echo "Target: ${STATEFULSET_NAME} (${PVC_NAME})" +echo "Date: $(date)" +echo + +# Step 1: Verify local database integrity +echo "1. Verifying local database integrity" +INTEGRITY=$(sqlite3 "${LOCAL_DB}" "PRAGMA quick_check;" 2>&1) +if [[ "${INTEGRITY}" != "ok" ]]; then + echo "Error: Database failed integrity check" >&2 + echo "Details:" >&2 + echo "${INTEGRITY}" | head -10 >&2 + exit 1 +fi +LOCAL_SIZE=$(ls -lh "${LOCAL_DB}" | awk '{print $5}') +echo "Status: PASSED (${LOCAL_SIZE})" +echo + +# Confirm before proceeding +echo "This will:" +echo " - Scale down ${STATEFULSET_NAME} (production will be offline)" +echo " - Replace the database on ${PVC_NAME}" +echo " - Scale back up" +echo "" +read -p "Proceed? [y/N] " -n 1 -r +echo +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Aborted." + exit 1 +fi +echo + +# Step 2: Scale down the StatefulSet +echo "2. Scaling down ${STATEFULSET_NAME}" +CURRENT_REPLICAS=$(kubectl get statefulset "${STATEFULSET_NAME}" -o jsonpath='{.spec.replicas}') +echo "Current replicas: ${CURRENT_REPLICAS}" + +kubectl scale statefulset "${STATEFULSET_NAME}" --replicas=0 +echo "Waiting for pod to terminate..." +kubectl wait --for=delete pod/"${STATEFULSET_NAME}-0" --timeout=120s 2>/dev/null || true +echo "StatefulSet scaled down" +echo + +# Set up cleanup trap after scaling down +trap cleanup EXIT + +# Step 3: Create temporary pod +echo "3. Creating temporary pod" +cat </dev/null; then + EXISTING_SIZE=$(kubectl exec "${TEMP_POD_NAME}" -- ls -lh "${REMOTE_DB_PATH}" | awk '{print $5}') + echo "Existing database size: ${EXISTING_SIZE}" + + # Remove WAL/SHM files + kubectl exec "${TEMP_POD_NAME}" -- rm -f "${REMOTE_DB_PATH}-wal" "${REMOTE_DB_PATH}-shm" 2>/dev/null || true +else + echo "No existing database found" +fi + +echo "Uploading ${LOCAL_SIZE}..." +kubectl cp "${LOCAL_DB}" "${TEMP_POD_NAME}:${REMOTE_DB_PATH}" + +# Verify the copy +REMOTE_SIZE=$(kubectl exec "${TEMP_POD_NAME}" -- ls -lh "${REMOTE_DB_PATH}" | awk '{print $5}') +echo "Uploaded size: ${REMOTE_SIZE}" +echo + +# Step 5: Scale up StatefulSet +echo "5. Scaling up ${STATEFULSET_NAME}" +kubectl scale statefulset "${STATEFULSET_NAME}" --replicas="${CURRENT_REPLICAS}" +echo "Waiting for pod to be ready..." +kubectl wait --for=condition=Ready pod/"${STATEFULSET_NAME}-0" --timeout=300s +echo "StatefulSet scaled up" +echo + +# Step 6: Delete temporary pod (handled by trap, but do it explicitly) +echo "6. Cleaning up temporary pod" +trap - EXIT +kubectl delete pod "${TEMP_POD_NAME}" --wait=false +echo "Temporary pod deleted" +echo + +# Verify +echo "7. Verifying deployment" +sleep 2 # Give the app a moment to start +REMOTE_CHECK=$(kubectl exec "${STATEFULSET_NAME}-0" -- sqlite3 "${REMOTE_DB_PATH}" "PRAGMA quick_check;" 2>&1 || echo "ERROR") +if [[ "${REMOTE_CHECK}" == "ok" ]]; then + echo "Remote integrity: PASSED" +else + echo "Warning: Could not verify remote database" + echo "${REMOTE_CHECK}" +fi +echo + +echo "Deployment complete!" 
\ No newline at end of file diff --git a/scripts/db-integrity.sh b/scripts/db-integrity.sh new file mode 100755 index 00000000..9ddc0779 --- /dev/null +++ b/scripts/db-integrity.sh @@ -0,0 +1,137 @@ +#!/bin/bash +# Run integrity checks on a local SQLite database and display a summary. +# +# Usage: ./scripts/db-integrity.sh +# +# Performs: +# - PRAGMA integrity_check +# - PRAGMA foreign_key_check +# - REINDEX with constraint violation detection +# - Table row counts +# - Overall health status report + +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 " >&2 + exit 1 +fi + +DB_FILE="$1" + +if [[ ! -f "${DB_FILE}" ]]; then + echo "Error: Database file not found: ${DB_FILE}" >&2 + exit 1 +fi + +echo "Database Integrity Report" +echo "File: ${DB_FILE}" +echo "Size: $(ls -lh "${DB_FILE}" | awk '{print $5}')" +echo "Date: $(date)" +echo + +ISSUES_FOUND=0 + +# Integrity check (use quick_check first, it's faster and less likely to crash on corrupt DBs) +echo "1. PRAGMA quick_check" +INTEGRITY_RESULT=$(sqlite3 "${DB_FILE}" "PRAGMA quick_check;" 2>&1) +if [[ "${INTEGRITY_RESULT}" == "ok" ]]; then + echo "Status: PASSED" +else + echo "Status: FAILED" + echo "Details:" + echo "${INTEGRITY_RESULT}" | head -10 + if [[ $(echo "${INTEGRITY_RESULT}" | wc -l) -gt 10 ]]; then + echo "... (truncated, $(echo "${INTEGRITY_RESULT}" | wc -l) total issues)" + fi + ISSUES_FOUND=1 +fi +echo + +# Foreign key check +echo "2. PRAGMA foreign_key_check" +FK_RESULT=$(sqlite3 "${DB_FILE}" "PRAGMA foreign_key_check;" 2>&1) +if [[ -z "${FK_RESULT}" ]]; then + echo "Status: PASSED (no violations)" +else + echo "Status: FAILED" + echo "Violations found:" + echo "${FK_RESULT}" | head -10 + if [[ $(echo "${FK_RESULT}" | wc -l) -gt 10 ]]; then + echo "... (truncated)" + fi + ISSUES_FOUND=1 +fi +echo + +# REINDEX attempt (may fail on corrupt databases) +echo "3. REINDEX check" +if REINDEX_RESULT=$(sqlite3 "${DB_FILE}" "REINDEX;" 2>&1); then + if [[ -z "${REINDEX_RESULT}" ]]; then + echo "Status: PASSED" + else + echo "Status: ISSUES DETECTED" + echo "Details:" + echo "${REINDEX_RESULT}" | head -10 + if [[ $(echo "${REINDEX_RESULT}" | wc -l) -gt 10 ]]; then + echo "... (truncated)" + fi + ISSUES_FOUND=1 + fi +else + echo "Status: FAILED (sqlite3 crashed or errored)" + echo "This usually indicates severe corruption." + if [[ -n "${REINDEX_RESULT}" ]]; then + echo "Details:" + echo "${REINDEX_RESULT}" | head -10 + if [[ $(echo "${REINDEX_RESULT}" | wc -l) -gt 10 ]]; then + echo "... (truncated)" + fi + fi + ISSUES_FOUND=1 +fi +echo + +# Table row counts +echo "4. Table Statistics" + +TABLES=$(sqlite3 "${DB_FILE}" "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name;") +TOTAL_ROWS=0 +TABLE_DATA="Table,Rows" + +for TABLE in ${TABLES}; do + ROW_COUNT=$(sqlite3 "${DB_FILE}" "SELECT COUNT(*) FROM \"${TABLE}\";" 2>/dev/null || echo "ERROR") + TABLE_DATA="${TABLE_DATA}"$'\n'"${TABLE},${ROW_COUNT}" + if [[ "${ROW_COUNT}" != "ERROR" ]]; then + TOTAL_ROWS=$((TOTAL_ROWS + ROW_COUNT)) + fi +done + +TABLE_DATA="${TABLE_DATA}"$'\n'"---,---" +TABLE_DATA="${TABLE_DATA}"$'\n'"TOTAL,${TOTAL_ROWS}" +echo "${TABLE_DATA}" | column -t -s ',' +echo + +# SQLite version and settings +echo "5. 
Database Configuration" +echo "SQLite version: $(sqlite3 "${DB_FILE}" "SELECT sqlite_version();")" +echo "Journal mode: $(sqlite3 "${DB_FILE}" "PRAGMA journal_mode;")" +echo "Page size: $(sqlite3 "${DB_FILE}" "PRAGMA page_size;")" +echo "Page count: $(sqlite3 "${DB_FILE}" "PRAGMA page_count;")" +echo "Freelist count: $(sqlite3 "${DB_FILE}" "PRAGMA freelist_count;")" +echo "Auto vacuum: $(sqlite3 "${DB_FILE}" "PRAGMA auto_vacuum;")" +echo + +# Overall health status +echo "Overall Health Status" +if [[ ${ISSUES_FOUND} -eq 0 ]]; then + echo "Status: HEALTHY" + echo "The database appears to be in good condition." +else + echo "Status: ISSUES DETECTED" + echo "Review the findings above. Consider running:" + echo " ./scripts/db-rebuild.sh ${DB_FILE}" +fi +echo + +exit ${ISSUES_FOUND} diff --git a/scripts/db-rebuild.sh b/scripts/db-rebuild.sh new file mode 100755 index 00000000..52e7e72f --- /dev/null +++ b/scripts/db-rebuild.sh @@ -0,0 +1,213 @@ +#!/bin/bash +# Dump and rebuild a SQLite database, attempting to recover corrupted data. +# +# Usage: ./scripts/db-rebuild.sh [output-db] +# +# Features: +# - First attempts sqlite3 .recover (better at salvaging corrupt data) +# - Falls back to .dump if .recover fails or isn't available +# - Compares row counts between source and rebuilt +# - Runs integrity check on rebuilt database +# - Reports any rows that couldn't be imported + +set -euo pipefail + +if [[ $# -lt 1 ]]; then + echo "Usage: $0 [output-db]" >&2 + exit 1 +fi + +SOURCE_DB="$1" + +if [[ ! -f "${SOURCE_DB}" ]]; then + echo "Error: Source database not found: ${SOURCE_DB}" >&2 + exit 1 +fi + +# Determine output path +if [[ $# -ge 2 ]]; then + OUTPUT_DB="$2" +else + BASENAME=$(basename "${SOURCE_DB}" .sqlite3) + BASENAME=$(basename "${BASENAME}" .db) + TIMESTAMP=$(date +"%Y-%m-%d_%H%M%S") + OUTPUT_DB="${BASENAME}-rebuilt-${TIMESTAMP}.sqlite3" +fi + +# Ensure output doesn't already exist +if [[ -f "${OUTPUT_DB}" ]]; then + echo "Error: Output file already exists: ${OUTPUT_DB}" >&2 + exit 1 +fi + +echo "Database Rebuild" +echo "Source: ${SOURCE_DB}" +echo "Output: ${OUTPUT_DB}" +echo "Date: $(date)" +echo + +# Create temp files for SQL dump and errors +DUMP_FILE=$(mktemp) +ERROR_FILE=$(mktemp) +trap "rm -f ${DUMP_FILE} ${ERROR_FILE}" EXIT + +# Try .recover first (better at salvaging corrupt data) +echo "1. Exporting data from source database" + +RECOVER_AVAILABLE=0 +if sqlite3 "${SOURCE_DB}" ".recover" >/dev/null 2>&1; then + RECOVER_AVAILABLE=1 +fi + +if [[ ${RECOVER_AVAILABLE} -eq 1 ]]; then + echo "Using .recover (recommended for corrupt databases)..." + if sqlite3 "${SOURCE_DB}" ".recover" > "${DUMP_FILE}" 2>"${ERROR_FILE}"; then + echo "Export method: .recover (success)" + else + echo "Warning: .recover had issues, falling back to .dump" + if [[ -s "${ERROR_FILE}" ]]; then + echo "Recover errors:" + cat "${ERROR_FILE}" + fi + echo + echo "Trying .dump fallback..." + if ! sqlite3 "${SOURCE_DB}" ".dump" > "${DUMP_FILE}" 2>"${ERROR_FILE}"; then + echo "Error: Both .recover and .dump failed" >&2 + if [[ -s "${ERROR_FILE}" ]]; then + cat "${ERROR_FILE}" >&2 + fi + exit 1 + fi + echo "Export method: .dump (fallback)" + fi +else + echo "Using .dump (sqlite3 version doesn't support .recover)..." + if ! 
sqlite3 "${SOURCE_DB}" ".dump" > "${DUMP_FILE}" 2>"${ERROR_FILE}"; then + echo "Error: .dump failed" >&2 + if [[ -s "${ERROR_FILE}" ]]; then + cat "${ERROR_FILE}" >&2 + fi + exit 1 + fi + echo "Export method: .dump" +fi + +DUMP_SIZE=$(ls -lh "${DUMP_FILE}" | awk '{print $5}') +echo "Dump size: ${DUMP_SIZE}" +echo + +# Import into new database +echo "2. Importing into new database" + +# Create new database and import +IMPORT_ERRORS=$(mktemp) +trap "rm -f ${DUMP_FILE} ${ERROR_FILE} ${IMPORT_ERRORS}" EXIT + +if sqlite3 "${OUTPUT_DB}" < "${DUMP_FILE}" 2>"${IMPORT_ERRORS}"; then + echo "Import: Success" +else + echo "Import: Completed with errors" +fi + +if [[ -s "${IMPORT_ERRORS}" ]]; then + echo + echo "Import warnings/errors:" + cat "${IMPORT_ERRORS}" +fi +echo + +# Compare row counts +echo "3. Row Count Comparison" + +TABLES=$(sqlite3 "${SOURCE_DB}" "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name;" 2>/dev/null || echo "") +TOTAL_SOURCE=0 +TOTAL_REBUILT=0 +ROWS_LOST=0 +TABLE_DATA="Table,Source,Rebuilt,Diff" + +for TABLE in ${TABLES}; do + SOURCE_COUNT=$(sqlite3 "${SOURCE_DB}" "SELECT COUNT(*) FROM \"${TABLE}\";" 2>/dev/null || echo "ERROR") + REBUILT_COUNT=$(sqlite3 "${OUTPUT_DB}" "SELECT COUNT(*) FROM \"${TABLE}\";" 2>/dev/null || echo "0") + + if [[ "${SOURCE_COUNT}" == "ERROR" ]]; then + DIFF="N/A" + elif [[ "${REBUILT_COUNT}" == "0" && "${SOURCE_COUNT}" != "0" ]]; then + DIFF="-${SOURCE_COUNT}" + ROWS_LOST=$((ROWS_LOST + SOURCE_COUNT)) + else + DIFF=$((REBUILT_COUNT - SOURCE_COUNT)) + if [[ ${DIFF} -lt 0 ]]; then + ROWS_LOST=$((ROWS_LOST + (-DIFF))) + fi + if [[ ${DIFF} -ge 0 ]]; then + DIFF="+${DIFF}" + fi + fi + + TABLE_DATA="${TABLE_DATA}"$'\n'"${TABLE},${SOURCE_COUNT},${REBUILT_COUNT},${DIFF}" + + if [[ "${SOURCE_COUNT}" != "ERROR" ]]; then + TOTAL_SOURCE=$((TOTAL_SOURCE + SOURCE_COUNT)) + fi + if [[ "${REBUILT_COUNT}" != "ERROR" && "${REBUILT_COUNT}" != "0" ]] || [[ "${REBUILT_COUNT}" == "0" ]]; then + TOTAL_REBUILT=$((TOTAL_REBUILT + REBUILT_COUNT)) + fi +done + +TOTAL_DIFF=$((TOTAL_REBUILT - TOTAL_SOURCE)) +if [[ ${TOTAL_DIFF} -ge 0 ]]; then + TOTAL_DIFF="+${TOTAL_DIFF}" +fi +TABLE_DATA="${TABLE_DATA}"$'\n'"---,---,---,---" +TABLE_DATA="${TABLE_DATA}"$'\n'"TOTAL,${TOTAL_SOURCE},${TOTAL_REBUILT},${TOTAL_DIFF}" +echo "${TABLE_DATA}" | column -t -s ',' +echo + +# Run integrity check on rebuilt database +echo "4. Integrity Check (Rebuilt Database)" +INTEGRITY_RESULT=$(sqlite3 "${OUTPUT_DB}" "PRAGMA integrity_check;" 2>&1) +if [[ "${INTEGRITY_RESULT}" == "ok" ]]; then + echo "Integrity: PASSED" +else + echo "Integrity: FAILED" + echo "Details:" + echo "${INTEGRITY_RESULT}" | head -10 +fi + +FK_RESULT=$(sqlite3 "${OUTPUT_DB}" "PRAGMA foreign_key_check;" 2>&1) +if [[ -z "${FK_RESULT}" ]]; then + echo "Foreign keys: PASSED" +else + echo "Foreign keys: VIOLATIONS FOUND" + echo "${FK_RESULT}" | head -10 +fi +echo + +# File size comparison +echo "5. File Size Comparison" +SOURCE_SIZE=$(ls -lh "${SOURCE_DB}" | awk '{print $5}') +OUTPUT_SIZE=$(ls -lh "${OUTPUT_DB}" | awk '{print $5}') +echo "Source: ${SOURCE_SIZE}" +echo "Rebuilt: ${OUTPUT_SIZE}" +echo + +# Summary +echo "Summary" +echo "Rebuilt database: ${OUTPUT_DB}" + +if [[ ${ROWS_LOST} -gt 0 ]]; then + echo + echo "WARNING: ${ROWS_LOST} rows could not be recovered." + echo "Review the row count comparison above for details." +fi + +if [[ "${INTEGRITY_RESULT}" == "ok" ]]; then + echo + echo "The rebuilt database passed integrity checks." 
+ echo "You can verify it further with:" + echo " ./scripts/db-integrity.sh ${OUTPUT_DB}" +else + echo + echo "The rebuilt database has integrity issues." + echo "Manual intervention may be required." +fi From 7e7ba3ebca28ea36516dfb1e92d87b2b019be899 Mon Sep 17 00:00:00 2001 From: Carl Vitullo Date: Wed, 28 Jan 2026 14:19:31 -0500 Subject: [PATCH 2/2] Restructure db scripts to minimize downtime during recovery Integrity checks now run remotely against the live pod via better-sqlite3 (no downtime). Backups use the .backup() API for consistent snapshots without WAL/SHM copying. Recovery is a single pipeline that rebuilds directly on the PVC volume, avoiding slow network transfers of the full DB. - db-common.sh: shared constants and utilities - db-integrity.sh: remote checks via kubectl exec + node -e (readonly) - db-backup.sh: consistent backup via better-sqlite3 .backup() API - db-recover.sh: full pipeline replacing db-rebuild.sh + db-deploy.sh - Fixes wrong PVC name (was data-mod-bot-set-0, now mod-bot-pvc-mod-bot-set-0) Co-Authored-By: Claude Opus 4.5 --- notes/2026-01-28_1_db-scripts-restructure.md | 41 +++ scripts/db-backup.sh | 75 ++-- scripts/db-common.sh | 35 ++ scripts/db-deploy.sh | 169 --------- scripts/db-integrity.sh | 250 ++++++------- scripts/db-rebuild.sh | 213 ----------- scripts/db-recover.sh | 368 +++++++++++++++++++ 7 files changed, 608 insertions(+), 543 deletions(-) create mode 100644 notes/2026-01-28_1_db-scripts-restructure.md create mode 100755 scripts/db-common.sh delete mode 100755 scripts/db-deploy.sh delete mode 100755 scripts/db-rebuild.sh create mode 100755 scripts/db-recover.sh diff --git a/notes/2026-01-28_1_db-scripts-restructure.md b/notes/2026-01-28_1_db-scripts-restructure.md new file mode 100644 index 00000000..9da73e9f --- /dev/null +++ b/notes/2026-01-28_1_db-scripts-restructure.md @@ -0,0 +1,41 @@ +# Database Maintenance Scripts Restructure + +## What changed +Restructured 4 db maintenance scripts into 3 scripts (plus a shared common file) that minimize downtime during recovery. + +### Before (4 scripts) +- `db-integrity.sh` — local-only, required sqlite3 CLI and a local DB file +- `db-backup.sh` — raw `kubectl cp`, copied WAL/SHM separately (inconsistent state risk) +- `db-rebuild.sh` — local-only .recover/.dump +- `db-deploy.sh` — uploaded rebuilt DB to volume via temp pod (slow network transfer of full DB) + +### After (3 scripts + shared) +- `db-common.sh` — shared constants and utilities (sourced by all scripts) +- `db-integrity.sh` — remote integrity check via `kubectl exec` + `node -e` with `better-sqlite3` (readonly, no downtime) +- `db-backup.sh` — consistent backup via `better-sqlite3`'s `.backup()` API (single consistent file, no WAL/SHM needed) +- `db-recover.sh` — full pipeline: recovery pod → rebuild on volume → deploy (no large network transfers) + +## Key design decisions + +### `node -e` with `better-sqlite3` instead of sqlite3 CLI +The production image doesn't have sqlite3 CLI but does have better-sqlite3 (it's a dependency). Using `node -e` for remote operations avoids needing to install anything on the production pod. 
+ +### Recovery pod approach +- Uses `alpine` + `apk add sqlite` for the recovery pod (needs sqlite3 CLI for .recover/.dump) +- RWO PVC constraint means: create recovery pod first (stays Pending) → scale down production (frees PVC) → recovery pod becomes Ready +- All I/O stays on the volume — no downloading/uploading the full DB over the network + +### PVC name +Confirmed PVC name is `mod-bot-pvc-mod-bot-set-0` (was wrong in old db-deploy.sh as `data-mod-bot-set-0`). + +### Cleanup trap ordering +Recovery pod must be deleted BEFORE scaling StatefulSet back up, because the recovery pod holds the RWO PVC. If StatefulSet tries to schedule while recovery pod has the PVC, the new pod will be stuck Pending. + +## Constants (in db-common.sh) +``` +STATEFULSET_NAME="mod-bot-set" +POD_NAME="mod-bot-set-0" +PVC_NAME="mod-bot-pvc-mod-bot-set-0" +REMOTE_DB_PATH="/data/mod-bot.sqlite3" +RECOVERY_POD_NAME="db-recovery-temp" +``` diff --git a/scripts/db-backup.sh b/scripts/db-backup.sh index 439f5b1c..b4d7bb2f 100755 --- a/scripts/db-backup.sh +++ b/scripts/db-backup.sh @@ -1,14 +1,18 @@ #!/bin/bash -# Backup the production database from Kubernetes pod to local destination. -# Usage: ./scripts/db-backup.sh [destination] +# Backup the production database using better-sqlite3's backup API. +# Produces a single consistent file without needing to copy WAL/SHM. # -# Arguments: -# destination - Optional path for backup file. Defaults to timestamped file in current directory. +# Usage: ./scripts/db-backup.sh [destination] set -euo pipefail -POD_NAME="mod-bot-set-0" -REMOTE_DB_PATH="/data/mod-bot.sqlite3" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +source "${SCRIPT_DIR}/db-common.sh" + +check_kubectl +check_sqlite3_local + +REMOTE_BACKUP_PATH="/data/mod-bot-backup-tmp.sqlite3" # Determine destination path if [[ $# -ge 1 ]]; then @@ -18,43 +22,50 @@ else DESTINATION="./mod-bot-backup-${TIMESTAMP}.sqlite3" fi -echo "Backing up database from ${POD_NAME}..." +# Cleanup: remove temp file from pod on exit +cleanup() { + echo "Cleaning up temporary backup on pod..." + kubectl exec "${POD_NAME}" -- rm -f "${REMOTE_BACKUP_PATH}" 2>/dev/null || true +} +trap cleanup EXIT + +echo "Backing up database from ${POD_NAME}" echo " Source: ${POD_NAME}:${REMOTE_DB_PATH}" echo " Destination: ${DESTINATION}" -echo +echo "" -# Copy the database file from the pod -if ! kubectl cp "${POD_NAME}:${REMOTE_DB_PATH}" "${DESTINATION}"; then - echo "Error: Failed to copy database from pod" >&2 - exit 1 -fi +log_step "Creating consistent backup on pod via better-sqlite3" +kubectl exec "${POD_NAME}" -- node -e " +const Database = require('better-sqlite3'); +const db = new Database('${REMOTE_DB_PATH}', { readonly: true }); +db.backup('${REMOTE_BACKUP_PATH}') + .then(() => { + db.close(); + console.log('Backup created successfully on pod'); + }) + .catch(err => { + db.close(); + console.error('Backup failed: ' + err.message); + process.exit(1); + }); +" -# Also copy WAL and SHM files if they exist (for complete backup) -echo "Checking for WAL files..." -if kubectl exec "${POD_NAME}" -- test -f "${REMOTE_DB_PATH}-wal" 2>/dev/null; then - echo " Copying WAL file..." - kubectl cp "${POD_NAME}:${REMOTE_DB_PATH}-wal" "${DESTINATION}-wal" 2>/dev/null || true -fi +log_step "Downloading backup to local machine" +kubectl cp "${POD_NAME}:${REMOTE_BACKUP_PATH}" "${DESTINATION}" -if kubectl exec "${POD_NAME}" -- test -f "${REMOTE_DB_PATH}-shm" 2>/dev/null; then - echo " Copying SHM file..." 
- kubectl cp "${POD_NAME}:${REMOTE_DB_PATH}-shm" "${DESTINATION}-shm" 2>/dev/null || true -fi - -# Show file size FILE_SIZE=$(ls -lh "${DESTINATION}" | awk '{print $5}') -echo -echo "Backup complete!" echo " File: ${DESTINATION}" echo " Size: ${FILE_SIZE}" -# Quick integrity check -echo -echo "Running quick integrity check..." +log_step "Running local integrity check" INTEGRITY=$(sqlite3 "${DESTINATION}" "PRAGMA quick_check;" 2>&1) if [[ "${INTEGRITY}" == "ok" ]]; then - echo " Integrity: OK" + echo " Integrity: PASSED" else echo " Integrity: ISSUES DETECTED" - echo " Run ./scripts/db-integrity.sh ${DESTINATION} for details" + echo " Run ./scripts/db-integrity.sh for details on the production database" + echo " The backup may reflect pre-existing corruption in the source." fi + +echo "" +echo "Backup complete: ${DESTINATION}" diff --git a/scripts/db-common.sh b/scripts/db-common.sh new file mode 100755 index 00000000..8a2e9e02 --- /dev/null +++ b/scripts/db-common.sh @@ -0,0 +1,35 @@ +#!/bin/bash +# Shared constants and utilities for database maintenance scripts. +# Source this file: source "$(dirname "$0")/db-common.sh" + +STATEFULSET_NAME="mod-bot-set" +POD_NAME="mod-bot-set-0" +PVC_NAME="mod-bot-pvc-mod-bot-set-0" +REMOTE_DB_PATH="/data/mod-bot.sqlite3" +RECOVERY_POD_NAME="db-recovery-temp" + +STEP_COUNTER=0 + +log_step() { + STEP_COUNTER=$((STEP_COUNTER + 1)) + echo "" + echo "=== Step ${STEP_COUNTER}: $1 ===" +} + +check_kubectl() { + if ! command -v kubectl &>/dev/null; then + echo "Error: kubectl is not installed or not in PATH" >&2 + exit 1 + fi + if ! kubectl cluster-info &>/dev/null; then + echo "Error: Cannot connect to Kubernetes cluster" >&2 + exit 1 + fi +} + +check_sqlite3_local() { + if ! command -v sqlite3 &>/dev/null; then + echo "Error: sqlite3 is not installed or not in PATH" >&2 + exit 1 + fi +} diff --git a/scripts/db-deploy.sh b/scripts/db-deploy.sh deleted file mode 100755 index 52305d90..00000000 --- a/scripts/db-deploy.sh +++ /dev/null @@ -1,169 +0,0 @@ -#!/bin/bash -# Deploy a repaired database to production using a temporary pod. -# -# Usage: ./scripts/db-deploy.sh -# -# This script will: -# 1. Verify local database integrity -# 2. Scale down the production StatefulSet -# 3. Create a temporary pod attached to the data volume -# 4. Copy the repaired database to the volume -# 5. Scale up the production StatefulSet -# 6. Delete the temporary pod - -set -euo pipefail - -STATEFULSET_NAME="mod-bot-set" -PVC_NAME="data-mod-bot-set-0" -REMOTE_DB_PATH="/data/mod-bot.sqlite3" -TEMP_POD_NAME="db-deploy-temp" -NAMESPACE="${NAMESPACE:-default}" - -if [[ $# -lt 1 ]]; then - echo "Usage: $0 " >&2 - echo "" >&2 - echo "Deploys a repaired database to production by:" >&2 - echo " 1. Scaling down the StatefulSet" >&2 - echo " 2. Mounting the volume in a temporary pod" >&2 - echo " 3. Copying the database" >&2 - echo " 4. Scaling up and cleaning up" >&2 - exit 1 -fi - -LOCAL_DB="$1" - -if [[ ! -f "${LOCAL_DB}" ]]; then - echo "Error: Database file not found: ${LOCAL_DB}" >&2 - exit 1 -fi - -cleanup() { - echo "" - echo "Cleaning up..." - kubectl delete pod "${TEMP_POD_NAME}" --ignore-not-found=true 2>/dev/null || true -} - -echo "Database Deployment" -echo "Source: ${LOCAL_DB}" -echo "Target: ${STATEFULSET_NAME} (${PVC_NAME})" -echo "Date: $(date)" -echo - -# Step 1: Verify local database integrity -echo "1. 
Verifying local database integrity" -INTEGRITY=$(sqlite3 "${LOCAL_DB}" "PRAGMA quick_check;" 2>&1) -if [[ "${INTEGRITY}" != "ok" ]]; then - echo "Error: Database failed integrity check" >&2 - echo "Details:" >&2 - echo "${INTEGRITY}" | head -10 >&2 - exit 1 -fi -LOCAL_SIZE=$(ls -lh "${LOCAL_DB}" | awk '{print $5}') -echo "Status: PASSED (${LOCAL_SIZE})" -echo - -# Confirm before proceeding -echo "This will:" -echo " - Scale down ${STATEFULSET_NAME} (production will be offline)" -echo " - Replace the database on ${PVC_NAME}" -echo " - Scale back up" -echo "" -read -p "Proceed? [y/N] " -n 1 -r -echo -if [[ ! $REPLY =~ ^[Yy]$ ]]; then - echo "Aborted." - exit 1 -fi -echo - -# Step 2: Scale down the StatefulSet -echo "2. Scaling down ${STATEFULSET_NAME}" -CURRENT_REPLICAS=$(kubectl get statefulset "${STATEFULSET_NAME}" -o jsonpath='{.spec.replicas}') -echo "Current replicas: ${CURRENT_REPLICAS}" - -kubectl scale statefulset "${STATEFULSET_NAME}" --replicas=0 -echo "Waiting for pod to terminate..." -kubectl wait --for=delete pod/"${STATEFULSET_NAME}-0" --timeout=120s 2>/dev/null || true -echo "StatefulSet scaled down" -echo - -# Set up cleanup trap after scaling down -trap cleanup EXIT - -# Step 3: Create temporary pod -echo "3. Creating temporary pod" -cat </dev/null; then - EXISTING_SIZE=$(kubectl exec "${TEMP_POD_NAME}" -- ls -lh "${REMOTE_DB_PATH}" | awk '{print $5}') - echo "Existing database size: ${EXISTING_SIZE}" - - # Remove WAL/SHM files - kubectl exec "${TEMP_POD_NAME}" -- rm -f "${REMOTE_DB_PATH}-wal" "${REMOTE_DB_PATH}-shm" 2>/dev/null || true -else - echo "No existing database found" -fi - -echo "Uploading ${LOCAL_SIZE}..." -kubectl cp "${LOCAL_DB}" "${TEMP_POD_NAME}:${REMOTE_DB_PATH}" - -# Verify the copy -REMOTE_SIZE=$(kubectl exec "${TEMP_POD_NAME}" -- ls -lh "${REMOTE_DB_PATH}" | awk '{print $5}') -echo "Uploaded size: ${REMOTE_SIZE}" -echo - -# Step 5: Scale up StatefulSet -echo "5. Scaling up ${STATEFULSET_NAME}" -kubectl scale statefulset "${STATEFULSET_NAME}" --replicas="${CURRENT_REPLICAS}" -echo "Waiting for pod to be ready..." -kubectl wait --for=condition=Ready pod/"${STATEFULSET_NAME}-0" --timeout=300s -echo "StatefulSet scaled up" -echo - -# Step 6: Delete temporary pod (handled by trap, but do it explicitly) -echo "6. Cleaning up temporary pod" -trap - EXIT -kubectl delete pod "${TEMP_POD_NAME}" --wait=false -echo "Temporary pod deleted" -echo - -# Verify -echo "7. Verifying deployment" -sleep 2 # Give the app a moment to start -REMOTE_CHECK=$(kubectl exec "${STATEFULSET_NAME}-0" -- sqlite3 "${REMOTE_DB_PATH}" "PRAGMA quick_check;" 2>&1 || echo "ERROR") -if [[ "${REMOTE_CHECK}" == "ok" ]]; then - echo "Remote integrity: PASSED" -else - echo "Warning: Could not verify remote database" - echo "${REMOTE_CHECK}" -fi -echo - -echo "Deployment complete!" \ No newline at end of file diff --git a/scripts/db-integrity.sh b/scripts/db-integrity.sh index 9ddc0779..c066754e 100755 --- a/scripts/db-integrity.sh +++ b/scripts/db-integrity.sh @@ -1,137 +1,129 @@ #!/bin/bash -# Run integrity checks on a local SQLite database and display a summary. +# Run integrity checks on the production database via kubectl exec. +# Read-only, non-invasive. Runs against the live pod using better-sqlite3. 
# -# Usage: ./scripts/db-integrity.sh -# -# Performs: -# - PRAGMA integrity_check -# - PRAGMA foreign_key_check -# - REINDEX with constraint violation detection -# - Table row counts -# - Overall health status report +# Usage: ./scripts/db-integrity.sh set -euo pipefail -if [[ $# -lt 1 ]]; then - echo "Usage: $0 " >&2 - exit 1 -fi - -DB_FILE="$1" +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +source "${SCRIPT_DIR}/db-common.sh" -if [[ ! -f "${DB_FILE}" ]]; then - echo "Error: Database file not found: ${DB_FILE}" >&2 - exit 1 -fi +check_kubectl -echo "Database Integrity Report" -echo "File: ${DB_FILE}" -echo "Size: $(ls -lh "${DB_FILE}" | awk '{print $5}')" +echo "Database Integrity Report (Remote)" +echo "Pod: ${POD_NAME}" +echo "Database: ${REMOTE_DB_PATH}" echo "Date: $(date)" -echo - -ISSUES_FOUND=0 - -# Integrity check (use quick_check first, it's faster and less likely to crash on corrupt DBs) -echo "1. PRAGMA quick_check" -INTEGRITY_RESULT=$(sqlite3 "${DB_FILE}" "PRAGMA quick_check;" 2>&1) -if [[ "${INTEGRITY_RESULT}" == "ok" ]]; then - echo "Status: PASSED" -else - echo "Status: FAILED" - echo "Details:" - echo "${INTEGRITY_RESULT}" | head -10 - if [[ $(echo "${INTEGRITY_RESULT}" | wc -l) -gt 10 ]]; then - echo "... (truncated, $(echo "${INTEGRITY_RESULT}" | wc -l) total issues)" - fi - ISSUES_FOUND=1 -fi -echo - -# Foreign key check -echo "2. PRAGMA foreign_key_check" -FK_RESULT=$(sqlite3 "${DB_FILE}" "PRAGMA foreign_key_check;" 2>&1) -if [[ -z "${FK_RESULT}" ]]; then - echo "Status: PASSED (no violations)" -else - echo "Status: FAILED" - echo "Violations found:" - echo "${FK_RESULT}" | head -10 - if [[ $(echo "${FK_RESULT}" | wc -l) -gt 10 ]]; then - echo "... (truncated)" - fi - ISSUES_FOUND=1 -fi -echo - -# REINDEX attempt (may fail on corrupt databases) -echo "3. REINDEX check" -if REINDEX_RESULT=$(sqlite3 "${DB_FILE}" "REINDEX;" 2>&1); then - if [[ -z "${REINDEX_RESULT}" ]]; then - echo "Status: PASSED" - else - echo "Status: ISSUES DETECTED" - echo "Details:" - echo "${REINDEX_RESULT}" | head -10 - if [[ $(echo "${REINDEX_RESULT}" | wc -l) -gt 10 ]]; then - echo "... (truncated)" - fi - ISSUES_FOUND=1 - fi -else - echo "Status: FAILED (sqlite3 crashed or errored)" - echo "This usually indicates severe corruption." - if [[ -n "${REINDEX_RESULT}" ]]; then - echo "Details:" - echo "${REINDEX_RESULT}" | head -10 - if [[ $(echo "${REINDEX_RESULT}" | wc -l) -gt 10 ]]; then - echo "... (truncated)" - fi - fi - ISSUES_FOUND=1 -fi -echo - -# Table row counts -echo "4. Table Statistics" - -TABLES=$(sqlite3 "${DB_FILE}" "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name;") -TOTAL_ROWS=0 -TABLE_DATA="Table,Rows" - -for TABLE in ${TABLES}; do - ROW_COUNT=$(sqlite3 "${DB_FILE}" "SELECT COUNT(*) FROM \"${TABLE}\";" 2>/dev/null || echo "ERROR") - TABLE_DATA="${TABLE_DATA}"$'\n'"${TABLE},${ROW_COUNT}" - if [[ "${ROW_COUNT}" != "ERROR" ]]; then - TOTAL_ROWS=$((TOTAL_ROWS + ROW_COUNT)) - fi -done - -TABLE_DATA="${TABLE_DATA}"$'\n'"---,---" -TABLE_DATA="${TABLE_DATA}"$'\n'"TOTAL,${TOTAL_ROWS}" -echo "${TABLE_DATA}" | column -t -s ',' -echo - -# SQLite version and settings -echo "5. 
Database Configuration" -echo "SQLite version: $(sqlite3 "${DB_FILE}" "SELECT sqlite_version();")" -echo "Journal mode: $(sqlite3 "${DB_FILE}" "PRAGMA journal_mode;")" -echo "Page size: $(sqlite3 "${DB_FILE}" "PRAGMA page_size;")" -echo "Page count: $(sqlite3 "${DB_FILE}" "PRAGMA page_count;")" -echo "Freelist count: $(sqlite3 "${DB_FILE}" "PRAGMA freelist_count;")" -echo "Auto vacuum: $(sqlite3 "${DB_FILE}" "PRAGMA auto_vacuum;")" -echo - -# Overall health status -echo "Overall Health Status" -if [[ ${ISSUES_FOUND} -eq 0 ]]; then - echo "Status: HEALTHY" - echo "The database appears to be in good condition." -else - echo "Status: ISSUES DETECTED" - echo "Review the findings above. Consider running:" - echo " ./scripts/db-rebuild.sh ${DB_FILE}" -fi -echo - -exit ${ISSUES_FOUND} +echo "" + +# Run all checks in a single node -e invocation to minimize kubectl exec overhead. +# The node script outputs formatted text and exits with code 1 if issues are found. +kubectl exec "${POD_NAME}" -- node -e " +const Database = require('better-sqlite3'); +let db; +try { + db = new Database('${REMOTE_DB_PATH}', { readonly: true }); +} catch (e) { + console.log('Error: Could not open database: ' + e.message); + process.exit(1); +} + +let issues = 0; + +// 1. quick_check +console.log('1. PRAGMA quick_check'); +try { + const rows = db.pragma('quick_check'); + const results = rows.map(r => r.quick_check); + if (results.length === 1 && results[0] === 'ok') { + console.log(' Status: PASSED'); + } else { + console.log(' Status: FAILED'); + console.log(' Details:'); + results.slice(0, 10).forEach(r => console.log(' ' + r)); + if (results.length > 10) console.log(' ... (' + results.length + ' total issues)'); + issues++; + } +} catch (e) { + console.log(' Status: ERROR'); + console.log(' ' + e.message); + issues++; +} + +// 2. foreign_key_check +console.log(''); +console.log('2. PRAGMA foreign_key_check'); +try { + const fkRows = db.pragma('foreign_key_check'); + if (fkRows.length === 0) { + console.log(' Status: PASSED (no violations)'); + } else { + console.log(' Status: FAILED'); + console.log(' Violations: ' + fkRows.length); + fkRows.slice(0, 10).forEach(v => console.log(' ' + JSON.stringify(v))); + if (fkRows.length > 10) console.log(' ... (truncated)'); + issues++; + } +} catch (e) { + console.log(' Status: ERROR'); + console.log(' ' + e.message); + issues++; +} + +// 3. Table row counts +console.log(''); +console.log('3. Table Row Counts'); +try { + const tables = db.prepare(\"SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name\").all(); + let total = 0; + const entries = []; + for (const t of tables) { + try { + const row = db.prepare('SELECT COUNT(*) as c FROM \"' + t.name + '\"').get(); + entries.push([t.name, String(row.c)]); + total += row.c; + } catch (e) { + entries.push([t.name, 'ERROR']); + } + } + const maxName = Math.max(...entries.map(e => e[0].length), 5); + const maxCount = Math.max(...entries.map(e => e[1].length), 5); + entries.forEach(([name, count]) => { + console.log(' ' + name.padEnd(maxName + 2) + count.padStart(maxCount)); + }); + console.log(' ' + '---'.padEnd(maxName + 2) + '---'.padStart(maxCount)); + console.log(' ' + 'TOTAL'.padEnd(maxName + 2) + String(total).padStart(maxCount)); +} catch (e) { + console.log(' Error: ' + e.message); +} + +// 4. DB config +console.log(''); +console.log('4. 
Database Configuration'); +try { + console.log(' Journal mode: ' + db.pragma('journal_mode')[0].journal_mode); + console.log(' Page size: ' + db.pragma('page_size')[0].page_size); + console.log(' Page count: ' + db.pragma('page_count')[0].page_count); + console.log(' Freelist count: ' + db.pragma('freelist_count')[0].freelist_count); +} catch (e) { + console.log(' Error: ' + e.message); +} + +db.close(); + +// 5. Overall health +console.log(''); +console.log('5. Overall Health Status'); +if (issues === 0) { + console.log(' Status: HEALTHY'); + console.log(' The database appears to be in good condition.'); +} else { + console.log(' Status: ISSUES DETECTED'); + console.log(' Review the findings above. Consider running:'); + console.log(' ./scripts/db-recover.sh'); +} +console.log(''); + +process.exit(issues > 0 ? 1 : 0); +" diff --git a/scripts/db-rebuild.sh b/scripts/db-rebuild.sh deleted file mode 100755 index 52e7e72f..00000000 --- a/scripts/db-rebuild.sh +++ /dev/null @@ -1,213 +0,0 @@ -#!/bin/bash -# Dump and rebuild a SQLite database, attempting to recover corrupted data. -# -# Usage: ./scripts/db-rebuild.sh [output-db] -# -# Features: -# - First attempts sqlite3 .recover (better at salvaging corrupt data) -# - Falls back to .dump if .recover fails or isn't available -# - Compares row counts between source and rebuilt -# - Runs integrity check on rebuilt database -# - Reports any rows that couldn't be imported - -set -euo pipefail - -if [[ $# -lt 1 ]]; then - echo "Usage: $0 [output-db]" >&2 - exit 1 -fi - -SOURCE_DB="$1" - -if [[ ! -f "${SOURCE_DB}" ]]; then - echo "Error: Source database not found: ${SOURCE_DB}" >&2 - exit 1 -fi - -# Determine output path -if [[ $# -ge 2 ]]; then - OUTPUT_DB="$2" -else - BASENAME=$(basename "${SOURCE_DB}" .sqlite3) - BASENAME=$(basename "${BASENAME}" .db) - TIMESTAMP=$(date +"%Y-%m-%d_%H%M%S") - OUTPUT_DB="${BASENAME}-rebuilt-${TIMESTAMP}.sqlite3" -fi - -# Ensure output doesn't already exist -if [[ -f "${OUTPUT_DB}" ]]; then - echo "Error: Output file already exists: ${OUTPUT_DB}" >&2 - exit 1 -fi - -echo "Database Rebuild" -echo "Source: ${SOURCE_DB}" -echo "Output: ${OUTPUT_DB}" -echo "Date: $(date)" -echo - -# Create temp files for SQL dump and errors -DUMP_FILE=$(mktemp) -ERROR_FILE=$(mktemp) -trap "rm -f ${DUMP_FILE} ${ERROR_FILE}" EXIT - -# Try .recover first (better at salvaging corrupt data) -echo "1. Exporting data from source database" - -RECOVER_AVAILABLE=0 -if sqlite3 "${SOURCE_DB}" ".recover" >/dev/null 2>&1; then - RECOVER_AVAILABLE=1 -fi - -if [[ ${RECOVER_AVAILABLE} -eq 1 ]]; then - echo "Using .recover (recommended for corrupt databases)..." - if sqlite3 "${SOURCE_DB}" ".recover" > "${DUMP_FILE}" 2>"${ERROR_FILE}"; then - echo "Export method: .recover (success)" - else - echo "Warning: .recover had issues, falling back to .dump" - if [[ -s "${ERROR_FILE}" ]]; then - echo "Recover errors:" - cat "${ERROR_FILE}" - fi - echo - echo "Trying .dump fallback..." - if ! sqlite3 "${SOURCE_DB}" ".dump" > "${DUMP_FILE}" 2>"${ERROR_FILE}"; then - echo "Error: Both .recover and .dump failed" >&2 - if [[ -s "${ERROR_FILE}" ]]; then - cat "${ERROR_FILE}" >&2 - fi - exit 1 - fi - echo "Export method: .dump (fallback)" - fi -else - echo "Using .dump (sqlite3 version doesn't support .recover)..." - if ! 
sqlite3 "${SOURCE_DB}" ".dump" > "${DUMP_FILE}" 2>"${ERROR_FILE}"; then - echo "Error: .dump failed" >&2 - if [[ -s "${ERROR_FILE}" ]]; then - cat "${ERROR_FILE}" >&2 - fi - exit 1 - fi - echo "Export method: .dump" -fi - -DUMP_SIZE=$(ls -lh "${DUMP_FILE}" | awk '{print $5}') -echo "Dump size: ${DUMP_SIZE}" -echo - -# Import into new database -echo "2. Importing into new database" - -# Create new database and import -IMPORT_ERRORS=$(mktemp) -trap "rm -f ${DUMP_FILE} ${ERROR_FILE} ${IMPORT_ERRORS}" EXIT - -if sqlite3 "${OUTPUT_DB}" < "${DUMP_FILE}" 2>"${IMPORT_ERRORS}"; then - echo "Import: Success" -else - echo "Import: Completed with errors" -fi - -if [[ -s "${IMPORT_ERRORS}" ]]; then - echo - echo "Import warnings/errors:" - cat "${IMPORT_ERRORS}" -fi -echo - -# Compare row counts -echo "3. Row Count Comparison" - -TABLES=$(sqlite3 "${SOURCE_DB}" "SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name;" 2>/dev/null || echo "") -TOTAL_SOURCE=0 -TOTAL_REBUILT=0 -ROWS_LOST=0 -TABLE_DATA="Table,Source,Rebuilt,Diff" - -for TABLE in ${TABLES}; do - SOURCE_COUNT=$(sqlite3 "${SOURCE_DB}" "SELECT COUNT(*) FROM \"${TABLE}\";" 2>/dev/null || echo "ERROR") - REBUILT_COUNT=$(sqlite3 "${OUTPUT_DB}" "SELECT COUNT(*) FROM \"${TABLE}\";" 2>/dev/null || echo "0") - - if [[ "${SOURCE_COUNT}" == "ERROR" ]]; then - DIFF="N/A" - elif [[ "${REBUILT_COUNT}" == "0" && "${SOURCE_COUNT}" != "0" ]]; then - DIFF="-${SOURCE_COUNT}" - ROWS_LOST=$((ROWS_LOST + SOURCE_COUNT)) - else - DIFF=$((REBUILT_COUNT - SOURCE_COUNT)) - if [[ ${DIFF} -lt 0 ]]; then - ROWS_LOST=$((ROWS_LOST + (-DIFF))) - fi - if [[ ${DIFF} -ge 0 ]]; then - DIFF="+${DIFF}" - fi - fi - - TABLE_DATA="${TABLE_DATA}"$'\n'"${TABLE},${SOURCE_COUNT},${REBUILT_COUNT},${DIFF}" - - if [[ "${SOURCE_COUNT}" != "ERROR" ]]; then - TOTAL_SOURCE=$((TOTAL_SOURCE + SOURCE_COUNT)) - fi - if [[ "${REBUILT_COUNT}" != "ERROR" && "${REBUILT_COUNT}" != "0" ]] || [[ "${REBUILT_COUNT}" == "0" ]]; then - TOTAL_REBUILT=$((TOTAL_REBUILT + REBUILT_COUNT)) - fi -done - -TOTAL_DIFF=$((TOTAL_REBUILT - TOTAL_SOURCE)) -if [[ ${TOTAL_DIFF} -ge 0 ]]; then - TOTAL_DIFF="+${TOTAL_DIFF}" -fi -TABLE_DATA="${TABLE_DATA}"$'\n'"---,---,---,---" -TABLE_DATA="${TABLE_DATA}"$'\n'"TOTAL,${TOTAL_SOURCE},${TOTAL_REBUILT},${TOTAL_DIFF}" -echo "${TABLE_DATA}" | column -t -s ',' -echo - -# Run integrity check on rebuilt database -echo "4. Integrity Check (Rebuilt Database)" -INTEGRITY_RESULT=$(sqlite3 "${OUTPUT_DB}" "PRAGMA integrity_check;" 2>&1) -if [[ "${INTEGRITY_RESULT}" == "ok" ]]; then - echo "Integrity: PASSED" -else - echo "Integrity: FAILED" - echo "Details:" - echo "${INTEGRITY_RESULT}" | head -10 -fi - -FK_RESULT=$(sqlite3 "${OUTPUT_DB}" "PRAGMA foreign_key_check;" 2>&1) -if [[ -z "${FK_RESULT}" ]]; then - echo "Foreign keys: PASSED" -else - echo "Foreign keys: VIOLATIONS FOUND" - echo "${FK_RESULT}" | head -10 -fi -echo - -# File size comparison -echo "5. File Size Comparison" -SOURCE_SIZE=$(ls -lh "${SOURCE_DB}" | awk '{print $5}') -OUTPUT_SIZE=$(ls -lh "${OUTPUT_DB}" | awk '{print $5}') -echo "Source: ${SOURCE_SIZE}" -echo "Rebuilt: ${OUTPUT_SIZE}" -echo - -# Summary -echo "Summary" -echo "Rebuilt database: ${OUTPUT_DB}" - -if [[ ${ROWS_LOST} -gt 0 ]]; then - echo - echo "WARNING: ${ROWS_LOST} rows could not be recovered." - echo "Review the row count comparison above for details." -fi - -if [[ "${INTEGRITY_RESULT}" == "ok" ]]; then - echo - echo "The rebuilt database passed integrity checks." 
- echo "You can verify it further with:" - echo " ./scripts/db-integrity.sh ${OUTPUT_DB}" -else - echo - echo "The rebuilt database has integrity issues." - echo "Manual intervention may be required." -fi diff --git a/scripts/db-recover.sh b/scripts/db-recover.sh new file mode 100755 index 00000000..17edbb7a --- /dev/null +++ b/scripts/db-recover.sh @@ -0,0 +1,368 @@ +#!/bin/bash +# Full database recovery pipeline: backup, rebuild, and deploy in one operation. +# Rebuilds directly on the PVC volume to avoid slow network transfers. +# +# Usage: ./scripts/db-recover.sh +# +# Steps: +# 1. Create recovery pod (stays Pending until PVC is free) +# 2. Scale down StatefulSet to free PVC +# 3. Wait for recovery pod to become Ready +# 4. Install sqlite3 on recovery pod +# 5. Back up corrupt files on volume +# 6. Checkpoint WAL (best-effort) +# 7. Confirm corruption on volume +# 8. Rebuild database on volume (.recover or .dump) +# 9. Verify rebuilt database +# 10. Compare row counts +# 11. Confirm deployment (interactive) +# 12. Swap rebuilt DB into place +# 13. Scale up StatefulSet +# 14. Wait for readiness +# 15. Verify deployment via node -e +# 16. Clean up recovery pod + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +source "${SCRIPT_DIR}/db-common.sh" + +check_kubectl + +# State tracking for cleanup +SCALED_DOWN=0 +ORIGINAL_REPLICAS=1 +RECOVERY_POD_CREATED=0 +TIMESTAMP=$(date +"%Y-%m-%d_%H%M%S") +CORRUPT_BACKUP_DIR="/data/corrupt-bak-${TIMESTAMP}" +REBUILT_DB_PATH="/data/mod-bot-rebuilt.sqlite3" + +cleanup() { + echo "" + echo "=== Cleanup ===" + + # Delete recovery pod FIRST — it holds the RWO PVC and must release it + # before the StatefulSet pod can mount the volume. + if [[ ${RECOVERY_POD_CREATED} -eq 1 ]]; then + echo "Deleting recovery pod (to free PVC)..." + kubectl delete pod "${RECOVERY_POD_NAME}" --ignore-not-found=true 2>/dev/null || true + # Wait briefly for pod termination so PVC is released + kubectl wait --for=delete pod/"${RECOVERY_POD_NAME}" --timeout=60s 2>/dev/null || true + fi + + if [[ ${SCALED_DOWN} -eq 1 ]]; then + echo "Scaling StatefulSet back up to ${ORIGINAL_REPLICAS} replicas..." + kubectl scale statefulset "${STATEFULSET_NAME}" --replicas="${ORIGINAL_REPLICAS}" 2>/dev/null || true + fi + + if [[ ${SCALED_DOWN} -eq 1 || ${RECOVERY_POD_CREATED} -eq 1 ]]; then + echo "" + echo "Corrupt backup remains on volume at: ${CORRUPT_BACKUP_DIR}" + echo "You may need to inspect it manually." + fi +} +trap cleanup EXIT + +echo "Database Recovery Pipeline" +echo "Date: ${TIMESTAMP}" +echo "" +echo "This script will:" +echo " - Create a recovery pod attached to the data volume" +echo " - Scale down production (bot will be offline)" +echo " - Rebuild the database on the volume" +echo " - Scale production back up" +echo "" +read -p "Continue? [y/N] " -n 1 -r +echo "" +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Aborted." 
+  trap - EXIT
+  exit 0
+fi
+
+# ─── Step 1: Create recovery pod ───────────────────────────────────────────────
+log_step "Creating recovery pod (will stay Pending until PVC is free)"
+
+cat <<EOF | kubectl apply -f -
+apiVersion: v1
+kind: Pod
+metadata:
+  name: ${RECOVERY_POD_NAME}
+spec:
+  containers:
+    - name: recovery
+      image: alpine
+      command: ["sleep", "86400"]
+      volumeMounts:
+        - name: data
+          mountPath: /data
+  volumes:
+    - name: data
+      persistentVolumeClaim:
+        claimName: ${PVC_NAME}
+EOF
+RECOVERY_POD_CREATED=1
+echo "Recovery pod created (will mount ${PVC_NAME} once it is released)"
+
+# ─── Step 2: Scale down StatefulSet ───────────────────────────────────────────
+log_step "Scaling down ${STATEFULSET_NAME} to free the PVC"
+
+ORIGINAL_REPLICAS=$(kubectl get statefulset "${STATEFULSET_NAME}" -o jsonpath='{.spec.replicas}')
+echo "Current replicas: ${ORIGINAL_REPLICAS}"
+
+kubectl scale statefulset "${STATEFULSET_NAME}" --replicas=0
+SCALED_DOWN=1
+echo "Waiting for ${POD_NAME} to terminate..."
+kubectl wait --for=delete pod/"${POD_NAME}" --timeout=120s 2>/dev/null || true
+echo "StatefulSet scaled down"
+
+# ─── Step 3: Wait for recovery pod ────────────────────────────────────────────
+log_step "Waiting for recovery pod to become Ready"
+
+kubectl wait --for=condition=Ready pod/"${RECOVERY_POD_NAME}" --timeout=120s
+echo "Recovery pod is Ready"
+
+# ─── Step 4: Install sqlite3 ──────────────────────────────────────────────────
+log_step "Installing sqlite3 on recovery pod"
+
+kubectl exec "${RECOVERY_POD_NAME}" -- apk add --no-cache sqlite 2>&1 | tail -1
+echo "sqlite3 installed"
+
+# ─── Step 5: Back up corrupt files ────────────────────────────────────────────
+log_step "Backing up corrupt files on volume"
+
+kubectl exec "${RECOVERY_POD_NAME}" -- mkdir -p "${CORRUPT_BACKUP_DIR}"
+kubectl exec "${RECOVERY_POD_NAME}" -- sh -c "
+  cp '${REMOTE_DB_PATH}' '${CORRUPT_BACKUP_DIR}/' 2>/dev/null || true
+  cp '${REMOTE_DB_PATH}-wal' '${CORRUPT_BACKUP_DIR}/' 2>/dev/null || true
+  cp '${REMOTE_DB_PATH}-shm' '${CORRUPT_BACKUP_DIR}/' 2>/dev/null || true
+  ls -lh '${CORRUPT_BACKUP_DIR}/'
+"
+echo "Corrupt files backed up to ${CORRUPT_BACKUP_DIR}"
+
+# ─── Step 6: Checkpoint WAL ──────────────────────────────────────────────────
+log_step "Attempting WAL checkpoint (best-effort)"
+
+if kubectl exec "${RECOVERY_POD_NAME}" -- sqlite3 "${REMOTE_DB_PATH}" "PRAGMA wal_checkpoint(TRUNCATE);" 2>&1; then
+  echo "WAL checkpoint succeeded"
+else
+  echo "WAL checkpoint failed (expected if database is corrupt)"
+fi
+
+# ─── Step 7: Confirm corruption ──────────────────────────────────────────────
+log_step "Checking database integrity on volume"
+
+QUICK_CHECK=$(kubectl exec "${RECOVERY_POD_NAME}" -- sqlite3 "${REMOTE_DB_PATH}" "PRAGMA quick_check;" 2>&1 || true)
+echo "${QUICK_CHECK}" | head -5
+
+if [[ "${QUICK_CHECK}" == "ok" ]]; then
+  echo ""
+  echo "WARNING: Database passed quick_check — it may not be corrupt."
+  read -p "Continue with rebuild anyway? [y/N] " -n 1 -r
+  echo ""
+  if [[ ! $REPLY =~ ^[Yy]$ ]]; then
+    echo "Aborted. Scaling back up."
+    # cleanup trap handles scale-up
+    exit 0
+  fi
+fi
+
+# ─── Step 8: Rebuild database on volume ──────────────────────────────────────
+log_step "Rebuilding database on volume"
+
+# Remove any leftover rebuilt file
+kubectl exec "${RECOVERY_POD_NAME}" -- rm -f "${REBUILT_DB_PATH}"
+
+# Test if .recover is available by trying it against a temp empty DB
+RECOVER_AVAILABLE=0
+if kubectl exec "${RECOVERY_POD_NAME}" -- sh -c "
+  sqlite3 /tmp/test-recover.db 'CREATE TABLE t(x);' && \
+  sqlite3 /tmp/test-recover.db '.recover' >/dev/null 2>&1 && \
+  rm -f /tmp/test-recover.db
+" 2>/dev/null; then
+  RECOVER_AVAILABLE=1
+fi
+
+REBUILD_METHOD=""
+if [[ ${RECOVER_AVAILABLE} -eq 1 ]]; then
+  echo "Using .recover (preferred for corrupt databases)..."
+  if kubectl exec "${RECOVERY_POD_NAME}" -- sh -c "
+    sqlite3 '${REMOTE_DB_PATH}' '.recover' | sqlite3 '${REBUILT_DB_PATH}'
+  " 2>&1; then
+    REBUILD_METHOD=".recover"
+    echo "Rebuild via .recover succeeded"
+  else
+    echo ".recover failed, falling back to .dump..."
+  fi
+fi
+
+if [[ -z "${REBUILD_METHOD}" ]]; then
+  echo "Using .dump..."
+ if kubectl exec "${RECOVERY_POD_NAME}" -- sh -c " + sqlite3 '${REMOTE_DB_PATH}' '.dump' | sqlite3 '${REBUILT_DB_PATH}' + " 2>&1; then + REBUILD_METHOD=".dump" + echo "Rebuild via .dump succeeded" + else + echo "Error: Both .recover and .dump failed" >&2 + echo "Manual intervention required. Corrupt backup at: ${CORRUPT_BACKUP_DIR}" >&2 + exit 1 + fi +fi + +# ─── Step 9: Verify rebuilt database ───────────────────────────────────────── +log_step "Verifying rebuilt database" + +REBUILT_INTEGRITY=$(kubectl exec "${RECOVERY_POD_NAME}" -- sqlite3 "${REBUILT_DB_PATH}" "PRAGMA integrity_check;" 2>&1) +if [[ "${REBUILT_INTEGRITY}" == "ok" ]]; then + echo " Integrity check: PASSED" +else + echo " Integrity check: FAILED" + echo " ${REBUILT_INTEGRITY}" | head -5 + echo "" + echo "WARNING: Rebuilt database has integrity issues." + read -p "Continue anyway? [y/N] " -n 1 -r + echo "" + if [[ ! $REPLY =~ ^[Yy]$ ]]; then + exit 1 + fi +fi + +REBUILT_FK=$(kubectl exec "${RECOVERY_POD_NAME}" -- sqlite3 "${REBUILT_DB_PATH}" "PRAGMA foreign_key_check;" 2>&1) +if [[ -z "${REBUILT_FK}" ]]; then + echo " Foreign key check: PASSED" +else + echo " Foreign key check: VIOLATIONS FOUND" + echo " ${REBUILT_FK}" | head -5 +fi + +# ─── Step 10: Compare row counts ───────────────────────────────────────────── +log_step "Comparing row counts (corrupt vs rebuilt)" + +kubectl exec "${RECOVERY_POD_NAME}" -- sh -c " +TABLES=\$(sqlite3 '${REMOTE_DB_PATH}' \"SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name;\" 2>/dev/null) + +printf '%-30s %10s %10s %10s\n' 'Table' 'Corrupt' 'Rebuilt' 'Diff' +printf '%-30s %10s %10s %10s\n' '-----' '-------' '-------' '----' + +TOTAL_CORRUPT=0 +TOTAL_REBUILT=0 + +for TABLE in \${TABLES}; do + CORRUPT_COUNT=\$(sqlite3 '${REMOTE_DB_PATH}' \"SELECT COUNT(*) FROM \\\"\${TABLE}\\\";\" 2>/dev/null || echo 'ERR') + REBUILT_COUNT=\$(sqlite3 '${REBUILT_DB_PATH}' \"SELECT COUNT(*) FROM \\\"\${TABLE}\\\";\" 2>/dev/null || echo '0') + + if [ \"\${CORRUPT_COUNT}\" = 'ERR' ]; then + DIFF='N/A' + else + DIFF=\$((REBUILT_COUNT - CORRUPT_COUNT)) + TOTAL_CORRUPT=\$((TOTAL_CORRUPT + CORRUPT_COUNT)) + fi + TOTAL_REBUILT=\$((TOTAL_REBUILT + REBUILT_COUNT)) + + printf '%-30s %10s %10s %10s\n' \"\${TABLE}\" \"\${CORRUPT_COUNT}\" \"\${REBUILT_COUNT}\" \"\${DIFF}\" +done + +printf '%-30s %10s %10s %10s\n' '-----' '-------' '-------' '----' +printf '%-30s %10s %10s %10s\n' 'TOTAL' \"\${TOTAL_CORRUPT}\" \"\${TOTAL_REBUILT}\" \"\$((TOTAL_REBUILT - TOTAL_CORRUPT))\" +" + +# ─── Step 11: Confirm deployment ───────────────────────────────────────────── +log_step "Confirm deployment" + +CORRUPT_SIZE=$(kubectl exec "${RECOVERY_POD_NAME}" -- ls -lh "${REMOTE_DB_PATH}" 2>/dev/null | awk '{print $5}') +REBUILT_SIZE=$(kubectl exec "${RECOVERY_POD_NAME}" -- ls -lh "${REBUILT_DB_PATH}" 2>/dev/null | awk '{print $5}') + +echo "" +echo " Rebuild method: ${REBUILD_METHOD}" +echo " Corrupt DB size: ${CORRUPT_SIZE}" +echo " Rebuilt DB size: ${REBUILT_SIZE}" +echo " Corrupt backup: ${CORRUPT_BACKUP_DIR}" +echo "" +echo "This will replace the production database with the rebuilt copy." +read -p "Deploy rebuilt database? [y/N] " -n 1 -r +echo "" +if [[ ! $REPLY =~ ^[Yy]$ ]]; then + echo "Aborted. 
Corrupt backup remains at: ${CORRUPT_BACKUP_DIR}" + echo "Rebuilt DB remains at: ${REBUILT_DB_PATH}" + exit 0 +fi + +# ─── Step 12: Swap into place ──────────────────────────────────────────────── +log_step "Swapping rebuilt database into place" + +kubectl exec "${RECOVERY_POD_NAME}" -- sh -c " + mv '${REBUILT_DB_PATH}' '${REMOTE_DB_PATH}' + rm -f '${REMOTE_DB_PATH}-wal' '${REMOTE_DB_PATH}-shm' +" +echo "Database swapped" + +# ─── Step 13: Scale up StatefulSet ─────────────────────────────────────────── +log_step "Scaling up ${STATEFULSET_NAME}" + +kubectl scale statefulset "${STATEFULSET_NAME}" --replicas="${ORIGINAL_REPLICAS}" +SCALED_DOWN=0 +echo "StatefulSet scaling up to ${ORIGINAL_REPLICAS} replicas" + +# ─── Step 14: Wait for readiness ───────────────────────────────────────────── +log_step "Waiting for pod readiness" + +kubectl wait --for=condition=Ready pod/"${POD_NAME}" --timeout=300s +echo "Pod ${POD_NAME} is Ready" + +# ─── Step 15: Verify deployment ────────────────────────────────────────────── +log_step "Verifying deployment" + +# Give the app a moment to initialize +sleep 3 + +kubectl exec "${POD_NAME}" -- node -e " +const Database = require('better-sqlite3'); +const db = new Database('${REMOTE_DB_PATH}', { readonly: true }); + +// quick_check +const rows = db.pragma('quick_check'); +const results = rows.map(r => r.quick_check); +if (results.length === 1 && results[0] === 'ok') { + console.log('Integrity check: PASSED'); +} else { + console.log('Integrity check: FAILED'); + results.slice(0, 5).forEach(r => console.log(' ' + r)); + process.exitCode = 1; +} + +// FK check +const fkRows = db.pragma('foreign_key_check'); +if (fkRows.length === 0) { + console.log('Foreign key check: PASSED'); +} else { + console.log('Foreign key check: ' + fkRows.length + ' violations'); + process.exitCode = 1; +} + +// Row count +const tables = db.prepare(\"SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%'\").all(); +let total = 0; +for (const t of tables) { + try { total += db.prepare('SELECT COUNT(*) as c FROM \"' + t.name + '\"').get().c; } catch(e) {} +} +console.log('Total rows: ' + total); + +db.close(); +" + +# ─── Step 16: Clean up recovery pod ────────────────────────────────────────── +log_step "Cleaning up recovery pod" + +kubectl delete pod "${RECOVERY_POD_NAME}" --wait=false +RECOVERY_POD_CREATED=0 +echo "Recovery pod deleted" + +# Disarm the cleanup trap since we've handled everything +trap - EXIT + +echo "" +echo "=== Recovery Complete ===" +echo "The database has been rebuilt and deployed successfully." +echo "Corrupt backup preserved at: ${CORRUPT_BACKUP_DIR} (on the volume)"