Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
41 changes: 41 additions & 0 deletions notes/2026-01-28_1_db-scripts-restructure.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Database Maintenance Scripts Restructure

## What changed
Restructured 4 db maintenance scripts into 3 scripts (plus a shared common file) that minimize downtime during recovery.

### Before (4 scripts)
- `db-integrity.sh` — local-only, required sqlite3 CLI and a local DB file
- `db-backup.sh` — raw `kubectl cp`, copied WAL/SHM separately (inconsistent state risk)
- `db-rebuild.sh` — local-only .recover/.dump
- `db-deploy.sh` — uploaded rebuilt DB to volume via temp pod (slow network transfer of full DB)

### After (3 scripts + shared)
- `db-common.sh` — shared constants and utilities (sourced by all scripts)
- `db-integrity.sh` — remote integrity check via `kubectl exec` + `node -e` with `better-sqlite3` (readonly, no downtime)
- `db-backup.sh` — consistent backup via `better-sqlite3`'s `.backup()` API (single consistent file, no WAL/SHM needed)
- `db-recover.sh` — full pipeline: recovery pod → rebuild on volume → deploy (no large network transfers)

## Key design decisions

### `node -e` with `better-sqlite3` instead of sqlite3 CLI
The production image doesn't have sqlite3 CLI but does have better-sqlite3 (it's a dependency). Using `node -e` for remote operations avoids needing to install anything on the production pod.

### Recovery pod approach
- Uses `alpine` + `apk add sqlite` for the recovery pod (needs sqlite3 CLI for .recover/.dump)
- RWO PVC constraint means: create recovery pod first (stays Pending) → scale down production (frees PVC) → recovery pod becomes Ready
- All I/O stays on the volume — no downloading/uploading the full DB over the network

### PVC name
Confirmed PVC name is `mod-bot-pvc-mod-bot-set-0` (was wrong in old db-deploy.sh as `data-mod-bot-set-0`).

### Cleanup trap ordering
Recovery pod must be deleted BEFORE scaling StatefulSet back up, because the recovery pod holds the RWO PVC. If StatefulSet tries to schedule while recovery pod has the PVC, the new pod will be stuck Pending.

## Constants (in db-common.sh)
```
STATEFULSET_NAME="mod-bot-set"
POD_NAME="mod-bot-set-0"
PVC_NAME="mod-bot-pvc-mod-bot-set-0"
REMOTE_DB_PATH="/data/mod-bot.sqlite3"
RECOVERY_POD_NAME="db-recovery-temp"
```
71 changes: 71 additions & 0 deletions scripts/db-backup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/bin/bash
# Backup the production database using better-sqlite3's backup API.
# Produces a single consistent file without needing to copy WAL/SHM.
#
# Usage: ./scripts/db-backup.sh [destination]
#   destination  optional local output path
#                (default: ./mod-bot-backup-<timestamp>.sqlite3)
#
# Exits non-zero if the remote backup or the download fails; a failed
# local integrity check is reported but does not fail the script (the
# backup itself may still be a faithful copy of a corrupt source).

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
source "${SCRIPT_DIR}/db-common.sh"

check_kubectl
check_sqlite3_local

# Scratch path on the pod's volume; removed by the EXIT trap below.
REMOTE_BACKUP_PATH="/data/mod-bot-backup-tmp.sqlite3"

# Determine destination path (arg 1, or a timestamped file in the CWD)
if [[ $# -ge 1 ]]; then
  DESTINATION="$1"
else
  TIMESTAMP=$(date +"%Y-%m-%d_%H%M%S")
  DESTINATION="./mod-bot-backup-${TIMESTAMP}.sqlite3"
fi

# Cleanup: remove temp file from pod on exit.
# Best-effort (|| true): the pod may be unreachable or the file never created.
cleanup() {
  echo "Cleaning up temporary backup on pod..."
  kubectl exec "${POD_NAME}" -- rm -f "${REMOTE_BACKUP_PATH}" 2>/dev/null || true
}
trap cleanup EXIT

echo "Backing up database from ${POD_NAME}"
echo " Source: ${POD_NAME}:${REMOTE_DB_PATH}"
echo " Destination: ${DESTINATION}"
echo ""

log_step "Creating consistent backup on pod via better-sqlite3"
# The shell expands ${REMOTE_DB_PATH}/${REMOTE_BACKUP_PATH} into the JS source
# before it reaches node. A non-zero exit from node propagates through
# kubectl exec and aborts this script via set -e.
kubectl exec "${POD_NAME}" -- node -e "
const Database = require('better-sqlite3');
const db = new Database('${REMOTE_DB_PATH}', { readonly: true });
db.backup('${REMOTE_BACKUP_PATH}')
  .then(() => {
    db.close();
    console.log('Backup created successfully on pod');
  })
  .catch(err => {
    db.close();
    console.error('Backup failed: ' + err.message);
    process.exit(1);
  });
"

log_step "Downloading backup to local machine"
kubectl cp "${POD_NAME}:${REMOTE_BACKUP_PATH}" "${DESTINATION}"

# kubectl cp can fail soft in some versions; make sure we actually got a file.
if [[ ! -s "${DESTINATION}" ]]; then
  echo "Error: download produced an empty or missing file: ${DESTINATION}" >&2
  exit 1
fi

# du is more robust than parsing ls -lh columns (locale/format dependent).
FILE_SIZE=$(du -h "${DESTINATION}" | awk '{print $1}')
echo " File: ${DESTINATION}"
echo " Size: ${FILE_SIZE}"

log_step "Running local integrity check"
# '|| true' so a sqlite3 failure (e.g. "file is not a database") is reported
# via the branch below instead of silently aborting the script under set -e.
INTEGRITY=$(sqlite3 "${DESTINATION}" "PRAGMA quick_check;" 2>&1 || true)
if [[ "${INTEGRITY}" == "ok" ]]; then
  echo " Integrity: PASSED"
else
  echo " Integrity: ISSUES DETECTED"
  echo " Run ./scripts/db-integrity.sh for details on the production database"
  echo " The backup may reflect pre-existing corruption in the source."
fi

echo ""
echo "Backup complete: ${DESTINATION}"
35 changes: 35 additions & 0 deletions scripts/db-common.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Shared constants and utilities for database maintenance scripts.
# Source this file: source "$(dirname "$0")/db-common.sh"

# Kubernetes object names and the on-volume database path shared by
# db-integrity.sh, db-backup.sh and db-recover.sh.
STATEFULSET_NAME="mod-bot-set"
POD_NAME="mod-bot-set-0"
PVC_NAME="mod-bot-pvc-mod-bot-set-0"
REMOTE_DB_PATH="/data/mod-bot.sqlite3"
RECOVERY_POD_NAME="db-recovery-temp"

# Monotonic step number used by log_step; shared across a single script run.
STEP_COUNTER=0

# Print a blank line followed by a numbered "=== Step N: <label> ===" banner.
log_step() {
  STEP_COUNTER=$((STEP_COUNTER + 1))
  printf '\n=== Step %s: %s ===\n' "${STEP_COUNTER}" "$1"
}

# Abort unless kubectl is on PATH and a cluster is actually reachable.
check_kubectl() {
  command -v kubectl &>/dev/null || {
    echo "Error: kubectl is not installed or not in PATH" >&2
    exit 1
  }
  kubectl cluster-info &>/dev/null || {
    echo "Error: Cannot connect to Kubernetes cluster" >&2
    exit 1
  }
}

# Abort unless the sqlite3 CLI is available on the local machine.
check_sqlite3_local() {
  command -v sqlite3 &>/dev/null || {
    echo "Error: sqlite3 is not installed or not in PATH" >&2
    exit 1
  }
}
129 changes: 129 additions & 0 deletions scripts/db-integrity.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
#!/bin/bash
# Run integrity checks on the production database via kubectl exec.
# Read-only, non-invasive. Runs against the live pod using better-sqlite3.
# (better-sqlite3 is used because the production image has no sqlite3 CLI
# but does ship better-sqlite3 as an app dependency.)
#
# Usage: ./scripts/db-integrity.sh
#
# Exit status: 0 when all checks pass, 1 when any check finds issues —
# the node script's exit code propagates through kubectl exec, and
# set -e turns it into this script's exit status.

set -euo pipefail

# Resolve our own directory so the script works from any CWD.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
source "${SCRIPT_DIR}/db-common.sh"   # provides POD_NAME, REMOTE_DB_PATH, check_kubectl

check_kubectl

echo "Database Integrity Report (Remote)"
echo "Pod: ${POD_NAME}"
echo "Database: ${REMOTE_DB_PATH}"
echo "Date: $(date)"
echo ""

# Run all checks in a single node -e invocation to minimize kubectl exec overhead.
# The node script outputs formatted text and exits with code 1 if issues are found.
# NOTE: the double-quoted argument is expanded by the LOCAL shell first, so
# ${REMOTE_DB_PATH} is baked into the JS source before it reaches the pod,
# and the \" escapes below become plain double quotes inside the JS.
kubectl exec "${POD_NAME}" -- node -e "
const Database = require('better-sqlite3');
let db;
try {
  db = new Database('${REMOTE_DB_PATH}', { readonly: true });
} catch (e) {
  console.log('Error: Could not open database: ' + e.message);
  process.exit(1);
}

let issues = 0;

// 1. quick_check
console.log('1. PRAGMA quick_check');
try {
  const rows = db.pragma('quick_check');
  const results = rows.map(r => r.quick_check);
  if (results.length === 1 && results[0] === 'ok') {
    console.log(' Status: PASSED');
  } else {
    console.log(' Status: FAILED');
    console.log(' Details:');
    results.slice(0, 10).forEach(r => console.log(' ' + r));
    if (results.length > 10) console.log(' ... (' + results.length + ' total issues)');
    issues++;
  }
} catch (e) {
  console.log(' Status: ERROR');
  console.log(' ' + e.message);
  issues++;
}

// 2. foreign_key_check
console.log('');
console.log('2. PRAGMA foreign_key_check');
try {
  const fkRows = db.pragma('foreign_key_check');
  if (fkRows.length === 0) {
    console.log(' Status: PASSED (no violations)');
  } else {
    console.log(' Status: FAILED');
    console.log(' Violations: ' + fkRows.length);
    fkRows.slice(0, 10).forEach(v => console.log(' ' + JSON.stringify(v)));
    if (fkRows.length > 10) console.log(' ... (truncated)');
    issues++;
  }
} catch (e) {
  console.log(' Status: ERROR');
  console.log(' ' + e.message);
  issues++;
}

// 3. Table row counts
console.log('');
console.log('3. Table Row Counts');
try {
  const tables = db.prepare(\"SELECT name FROM sqlite_master WHERE type='table' AND name NOT LIKE 'sqlite_%' ORDER BY name\").all();
  let total = 0;
  const entries = [];
  for (const t of tables) {
    try {
      const row = db.prepare('SELECT COUNT(*) as c FROM \"' + t.name + '\"').get();
      entries.push([t.name, String(row.c)]);
      total += row.c;
    } catch (e) {
      entries.push([t.name, 'ERROR']);
    }
  }
  const maxName = Math.max(...entries.map(e => e[0].length), 5);
  const maxCount = Math.max(...entries.map(e => e[1].length), 5);
  entries.forEach(([name, count]) => {
    console.log(' ' + name.padEnd(maxName + 2) + count.padStart(maxCount));
  });
  console.log(' ' + '---'.padEnd(maxName + 2) + '---'.padStart(maxCount));
  console.log(' ' + 'TOTAL'.padEnd(maxName + 2) + String(total).padStart(maxCount));
} catch (e) {
  console.log(' Error: ' + e.message);
}

// 4. DB config
console.log('');
console.log('4. Database Configuration');
try {
  console.log(' Journal mode: ' + db.pragma('journal_mode')[0].journal_mode);
  console.log(' Page size: ' + db.pragma('page_size')[0].page_size);
  console.log(' Page count: ' + db.pragma('page_count')[0].page_count);
  console.log(' Freelist count: ' + db.pragma('freelist_count')[0].freelist_count);
} catch (e) {
  console.log(' Error: ' + e.message);
}

db.close();

// 5. Overall health
console.log('');
console.log('5. Overall Health Status');
if (issues === 0) {
  console.log(' Status: HEALTHY');
  console.log(' The database appears to be in good condition.');
} else {
  console.log(' Status: ISSUES DETECTED');
  console.log(' Review the findings above. Consider running:');
  console.log(' ./scripts/db-recover.sh');
}
console.log('');

process.exit(issues > 0 ? 1 : 0);
"
Loading