From aa0d41ffb41103187a6be657fe644ad59b94a150 Mon Sep 17 00:00:00 2001 From: vyalamar Date: Sun, 15 Feb 2026 01:55:35 -0800 Subject: [PATCH 1/2] Harden RocksDB snapshot error handling and add troubleshooting doc --- .../docs/content/troubleshooting-rocksdb.md | 52 +++++++++++++++++++ .../rocksdiff/RocksDBCheckpointDiffer.java | 12 ++--- .../om/snapshot/RocksDbPersistentList.java | 19 ++++--- .../om/snapshot/RocksDbPersistentMap.java | 46 +++++++++------- .../om/snapshot/RocksDbPersistentSet.java | 24 ++++++--- .../om/snapshot/SnapshotStorageException.java | 44 ++++++++++++++++ 6 files changed, 155 insertions(+), 42 deletions(-) create mode 100644 hadoop-hdds/docs/content/troubleshooting-rocksdb.md create mode 100644 hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotStorageException.java diff --git a/hadoop-hdds/docs/content/troubleshooting-rocksdb.md b/hadoop-hdds/docs/content/troubleshooting-rocksdb.md new file mode 100644 index 000000000000..2a1655510a5b --- /dev/null +++ b/hadoop-hdds/docs/content/troubleshooting-rocksdb.md @@ -0,0 +1,52 @@ +--- +title: "Troubleshooting RocksDB in Ozone" +weight: 180 +--- + +This page is a quick operator runbook for common RocksDB issues in Ozone components (OM, Recon, Datanode metadata, Snapshot DBs). + +## Quick Checks +- Verify native library: `ozone debug check-native` (loads RocksDB JNI and prints lib path). +- Confirm options in effect: `ozone debug ldb --db= list_column_families` then `ozone debug ldb --db= get_live_files_metadata` to see block size, compression, etc. +- Check open files and max_open_files: `ls -1 /` and compare to `ozone.om.snapshot.rocksdb.max.open.files` / `ozone.om.rocksdb.max.open.files`. + +## Symptom → Action +- **Compaction stalls / write stalls** + - Look for `Stall` and `Slowdown` counters in RocksDBStats metrics and OM logs. 
+ - Run `ozone debug om compaction-log-dag --db=` to inspect compaction DAG; prune if needed with `ozone repair ldb manual-compaction` (stop service first). + - Reduce L0 buildup: increase `target_file_size_base`, tune `level0_slowdown_writes_trigger`, or increase `max_background_jobs` via ini file (`hdds.datanode.metadata.rocksdb.ini` / `ozone.om.rocksdb.ini`). + +- **High latency / iterator scans slow** + - Enable block cache metrics; check `rocksdb.block.cache.*` in JMX. + - Use iterator lower/upper bounds to avoid full DB scans; ensure `ozone.metastore.rocksdb.statistics` is not OFF when debugging. + +- **WAL corruption / cannot open DB** + - Capture error code from RocksDB message; check if WAL replay failed. + - Try `ozone debug ldb --db= check` (read-only); if unrecoverable, restore from last valid checkpoint under snapshot content lock. + +- **Options drift after upgrades** + - Use `ozone tool rocksdb options` (preserves previous options) before upgrades; compare current `OPTIONS-*` files in DB dir. + - Keep ini files checked into config mgmt; reapply on restart. + +- **Checksum errors on SST** + - Identify file via log; run `ozone debug ldb --db= get_live_files_metadata | grep `. + - If isolated, delete snapshot content lock and rebuild from latest checkpoint; otherwise trigger full re-replication/restore. + +## Preventive Settings (per `ozone-default.xml`) +- `ozone.metastore.rocksdb.statistics` – enable StatsLevel for visibility (EXCEPT_DETAILED_TIMERS is a good default). +- `ozone.metastore.rocksdb.cf.write.buffer.size` – tune memtable per CF to match RAM and write rate. +- `ozone.om.snapshot.rocksdb.metrics.enabled` – keep ON for snapshot DBs unless perf testing. +- `hdds.datanode.metadata.rocksdb.cache.size` – size block cache per host for container DBs. 
+ +## Useful Commands +- List CFs: `ozone debug ldb --db= list_column_families` +- Live files metadata: `ozone debug ldb --db= get_live_files_metadata` +- Manual compaction (offline): `ozone repair ldb manual-compaction --db= --column-family=` +- Snapshot diff check DAG: `ozone debug om compaction-log-dag --db=` + +## When to Fall Back +- If compaction DAG traversal throws overflow guard, drop to full diff path and re-create DAG from latest checkpoints. +- If iterator-based diff fails repeatedly, toggle `performNonNativeDiff=true` and collect logs before re-enabling. + +## What to Capture for Bugs +- Component (OM/Recon/Datanode), DB path, RocksDB version (`pom.xml` shows baseline), full stack trace, `OPTIONS-*` files, and last 200 lines of log around failure. diff --git a/hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java b/hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java index 956a0caac7c7..0f2d1c858614 100644 --- a/hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java +++ b/hadoop-hdds/rocksdb-checkpoint-differ/src/main/java/org/apache/ozone/rocksdiff/RocksDBCheckpointDiffer.java @@ -573,8 +573,7 @@ byte[] addToCompactionLogTable(CompactionLogEntry compactionLogEntry) { try { activeRocksDB.get().put(compactionLogTableCFHandle, key, value); } catch (RocksDBException exception) { - // TODO: Revisit exception handling before merging the PR. - throw new RuntimeException(exception); + throw new RocksDBCheckpointDifferException("Failed to persist compaction log entry", exception); } return key; } @@ -968,8 +967,7 @@ synchronized void internalGetSSTDiffList(DifferSnapshotVersion src, DifferSnapsh // Clear output in case of error. Expect fall back to full diff sameFiles.clear(); differentFiles.clear(); - // TODO: Revisit error handling here. Use custom exception? 
- throw new RuntimeException(errorMsg); + throw new RocksDBCheckpointDifferException(errorMsg); } final Set nextLevel = new HashSet<>(); @@ -1143,8 +1141,7 @@ private synchronized Pair, List> getOlderFileNodes() { } } catch (InvalidProtocolBufferException exception) { - // TODO: Handle this properly before merging the PR. - throw new RuntimeException(exception); + throw new RocksDBCheckpointDifferException("Failed to parse compaction log entry", exception); } return Pair.of(compactionNodes, keysToRemove); } @@ -1156,8 +1153,7 @@ private synchronized void removeKeyFromCompactionLogTable( activeRocksDB.get().delete(compactionLogTableCFHandle, key); } } catch (RocksDBException exception) { - // TODO Handle exception properly before merging the PR. - throw new RuntimeException(exception); + throw new RocksDBCheckpointDifferException("Failed to delete compaction log entries", exception); } } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentList.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentList.java index af31a7955681..f433989c2c7e 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentList.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentList.java @@ -55,8 +55,7 @@ public boolean add(E entry) { db.get().put(columnFamilyHandle, rawKey, rawValue); return true; } catch (IOException | RocksDBException exception) { - // TODO: [SNAPSHOT] Fail gracefully. - throw new RuntimeException(exception); + throw SnapshotStorageException.fromRocksDB("append list entry", toRocks(exception)); } } @@ -75,8 +74,7 @@ public E get(int index) { byte[] rawValue = db.get().get(columnFamilyHandle, rawKey); return codecRegistry.asObject(rawValue, entryType); } catch (IOException | RocksDBException exception) { - // TODO: [SNAPSHOT] Fail gracefully. 
- throw new RuntimeException(exception); + throw SnapshotStorageException.fromRocksDB("read list entry", toRocks(exception)); } } @@ -99,8 +97,7 @@ public E next() { try { return codecRegistry.asObject(rawKey, entryType); } catch (IOException exception) { - // TODO: [SNAPSHOT] Fail gracefully. - throw new RuntimeException(exception); + throw SnapshotStorageException.fromIO("deserialize list entry", exception); } } @@ -110,4 +107,14 @@ public void close() { } }; } + + private RocksDBException toRocks(Exception e) { + if (e instanceof RocksDBException) { + return (RocksDBException) e; + } + if (e.getCause() instanceof RocksDBException) { + return (RocksDBException) e.getCause(); + } + return (RocksDBException) new RocksDBException(e.toString()).initCause(e); // no cause-taking constructor exists + } } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentMap.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentMap.java index 9b80b75eb643..55352a162ac3 100644 --- a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentMap.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentMap.java @@ -60,8 +60,7 @@ public V get(K key) { byte[] rawValue = db.get().get(columnFamilyHandle, rawKey); return codecRegistry.asObject(rawValue, valueType); } catch (IOException | RocksDBException exception) { - // TODO: [SNAPSHOT] Fail gracefully. - throw new RuntimeException(exception); + throw SnapshotStorageException.fromRocksDB("read map entry", toRocks(exception)); } } @@ -72,8 +71,7 @@ public void put(K key, V value) { byte[] rawValue = codecRegistry.asRawData(value); db.get().put(columnFamilyHandle, rawKey, rawValue); } catch (IOException | RocksDBException exception) { - // TODO: [SNAPSHOT] Fail gracefully.
- throw new RuntimeException(exception); + throw SnapshotStorageException.fromRocksDB("write map entry", toRocks(exception)); } } @@ -83,8 +81,7 @@ public void remove(K key) { byte[] rawKey = codecRegistry.asRawData(key); db.get().delete(columnFamilyHandle, rawKey); } catch (IOException | RocksDBException exception) { - // TODO: [SNAPSHOT] Fail gracefully. - throw new RuntimeException(exception); + throw SnapshotStorageException.fromRocksDB("delete map entry", toRocks(exception)); } } @@ -111,10 +108,9 @@ public ClosableIterator> iterator(Optional lowerBound, } else { upperBoundSlice = null; } - } catch (IOException exception) { - // TODO: [SNAPSHOT] Fail gracefully. - throw new RuntimeException(exception); - } + } catch (IOException exception) { + throw SnapshotStorageException.fromIO("deserialize map entry", exception); + } iterator = ManagedRocksIterator.managed( db.get().newIterator(columnFamilyHandle, readOptions)); @@ -165,16 +161,26 @@ public V setValue(V value) { } @Override - public void close() { - iterator.close(); - readOptions.close(); - if (upperBoundSlice != null) { - upperBoundSlice.close(); - } - if (lowerBoundSlice != null) { - lowerBoundSlice.close(); - } + public void close() { + iterator.close(); + readOptions.close(); + if (upperBoundSlice != null) { + upperBoundSlice.close(); + } + if (lowerBoundSlice != null) { + lowerBoundSlice.close(); } - }; + } + }; +} + + private RocksDBException toRocks(Exception e) { + if (e instanceof RocksDBException) { + return (RocksDBException) e; + } + if (e.getCause() instanceof RocksDBException) { + return (RocksDBException) e.getCause(); + } + return (RocksDBException) new RocksDBException(e.toString()).initCause(e); // no cause-taking constructor exists } } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentSet.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentSet.java index 53d15b9f88d1..01fef800df4f 100644 ---
a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentSet.java +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/RocksDbPersistentSet.java @@ -52,8 +52,7 @@ public void add(E entry) { byte[] rawValue = codecRegistry.asRawData(emptyByteArray); db.get().put(columnFamilyHandle, rawKey, rawValue); } catch (IOException | RocksDBException exception) { - // TODO: [SNAPSHOT] Fail gracefully. - throw new RuntimeException(exception); + throw SnapshotStorageException.fromRocksDB("add set entry", toRocks(exception)); } } @@ -76,15 +75,24 @@ public E next() { try { return codecRegistry.asObject(rawKey, entryType); } catch (IOException exception) { - // TODO: [SNAPSHOT] Fail gracefully. - throw new RuntimeException(exception); + throw SnapshotStorageException.fromIO("deserialize set entry", exception); } } @Override - public void close() { - managedRocksIterator.close(); - } - }; + public void close() { + managedRocksIterator.close(); + } + }; +} + + private RocksDBException toRocks(Exception e) { + if (e instanceof RocksDBException) { + return (RocksDBException) e; + } + if (e.getCause() instanceof RocksDBException) { + return (RocksDBException) e.getCause(); + } + return (RocksDBException) new RocksDBException(e.toString()).initCause(e); // no cause-taking constructor exists } } diff --git a/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotStorageException.java b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotStorageException.java new file mode 100644 index 000000000000..115c254a24f1 --- /dev/null +++ b/hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/snapshot/SnapshotStorageException.java @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.ozone.om.snapshot; + +import java.io.IOException; +import org.apache.hadoop.hdds.utils.db.RocksDatabaseException; +import org.rocksdb.RocksDBException; + +/** + * Unchecked wrapper for snapshot metadata store failures. + */ +public class SnapshotStorageException extends RuntimeException { + + public SnapshotStorageException(String message, Exception cause) { + super(message, cause); + } + + public static SnapshotStorageException fromRocksDB(String op, + RocksDBException e) { + return new SnapshotStorageException("Failed to " + op, + new RocksDatabaseException("Failed to " + op, e)); + } + + public static SnapshotStorageException fromIO(String op, + IOException e) { + return new SnapshotStorageException("Failed to " + op, + e); + } +} From 16780085e4d7d25fa6569d21c62f86925343b618 Mon Sep 17 00:00:00 2001 From: vyalamar Date: Sun, 15 Feb 2026 02:00:18 -0800 Subject: [PATCH 2/2] Harden RocksDB snapshot error handling --- .../docs/content/troubleshooting-rocksdb.md | 52 ------------------- 1 file changed, 52 deletions(-) delete mode 100644 hadoop-hdds/docs/content/troubleshooting-rocksdb.md diff --git a/hadoop-hdds/docs/content/troubleshooting-rocksdb.md b/hadoop-hdds/docs/content/troubleshooting-rocksdb.md deleted file mode 100644 index 2a1655510a5b..000000000000 ---
a/hadoop-hdds/docs/content/troubleshooting-rocksdb.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: "Troubleshooting RocksDB in Ozone" -weight: 180 ---- - -This page is a quick operator runbook for common RocksDB issues in Ozone components (OM, Recon, Datanode metadata, Snapshot DBs). - -## Quick Checks -- Verify native library: `ozone debug check-native` (loads RocksDB JNI and prints lib path). -- Confirm options in effect: `ozone debug ldb --db= list_column_families` then `ozone debug ldb --db= get_live_files_metadata` to see block size, compression, etc. -- Check open files and max_open_files: `ls -1 /` and compare to `ozone.om.snapshot.rocksdb.max.open.files` / `ozone.om.rocksdb.max.open.files`. - -## Symptom → Action -- **Compaction stalls / write stalls** - - Look for `Stall` and `Slowdown` counters in RocksDBStats metrics and OM logs. - - Run `ozone debug om compaction-log-dag --db=` to inspect compaction DAG; prune if needed with `ozone repair ldb manual-compaction` (stop service first). - - Reduce L0 buildup: increase `target_file_size_base`, tune `level0_slowdown_writes_trigger`, or increase `max_background_jobs` via ini file (`hdds.datanode.metadata.rocksdb.ini` / `ozone.om.rocksdb.ini`). - -- **High latency / iterator scans slow** - - Enable block cache metrics; check `rocksdb.block.cache.*` in JMX. - - Use iterator lower/upper bounds to avoid full DB scans; ensure `ozone.metastore.rocksdb.statistics` is not OFF when debugging. - -- **WAL corruption / cannot open DB** - - Capture error code from RocksDB message; check if WAL replay failed. - - Try `ozone debug ldb --db= check` (read-only); if unrecoverable, restore from last valid checkpoint under snapshot content lock. - -- **Options drift after upgrades** - - Use `ozone tool rocksdb options` (preserves previous options) before upgrades; compare current `OPTIONS-*` files in DB dir. - - Keep ini files checked into config mgmt; reapply on restart. 
- -- **Checksum errors on SST** - - Identify file via log; run `ozone debug ldb --db= get_live_files_metadata | grep `. - - If isolated, delete snapshot content lock and rebuild from latest checkpoint; otherwise trigger full re-replication/restore. - -## Preventive Settings (per `ozone-default.xml`) -- `ozone.metastore.rocksdb.statistics` – enable StatsLevel for visibility (EXCEPT_DETAILED_TIMERS is a good default). -- `ozone.metastore.rocksdb.cf.write.buffer.size` – tune memtable per CF to match RAM and write rate. -- `ozone.om.snapshot.rocksdb.metrics.enabled` – keep ON for snapshot DBs unless perf testing. -- `hdds.datanode.metadata.rocksdb.cache.size` – size block cache per host for container DBs. - -## Useful Commands -- List CFs: `ozone debug ldb --db= list_column_families` -- Live files metadata: `ozone debug ldb --db= get_live_files_metadata` -- Manual compaction (offline): `ozone repair ldb manual-compaction --db= --column-family=` -- Snapshot diff check DAG: `ozone debug om compaction-log-dag --db=` - -## When to Fall Back -- If compaction DAG traversal throws overflow guard, drop to full diff path and re-create DAG from latest checkpoints. -- If iterator-based diff fails repeatedly, toggle `performNonNativeDiff=true` and collect logs before re-enabling. - -## What to Capture for Bugs -- Component (OM/Recon/Datanode), DB path, RocksDB version (`pom.xml` shows baseline), full stack trace, `OPTIONS-*` files, and last 200 lines of log around failure.