Showing 40 changed files with 20 additions and 9,567 deletions.
- +6 −0 .gitmodules
- +8 −7 Makefile
- +1 −1 crates/binutils
- +1 −1 crates/coreutils
- +1 −0 crates/extra
- +1 −1 crates/extrautils
- +1 −1 crates/games
- +1 −0 crates/zfs
- +0 −126 crates/zfs/arcache.rs
- +0 −333 crates/zfs/avl.rs
- +0 −56 crates/zfs/block_ptr.rs
- +0 −1,935 crates/zfs/dmu_objset.rs
- +0 −85 crates/zfs/dnode.rs
- +0 −92 crates/zfs/dsl_dataset.rs
- +0 −37 crates/zfs/dsl_dir.rs
- +0 −17 crates/zfs/dsl_pool.rs
- +0 −42 crates/zfs/dvaddr.rs
- +0 −16 crates/zfs/from_bytes.rs
- +0 −147 crates/zfs/lzjb.rs
- +0 −621 crates/zfs/main.rs
- +0 −587 crates/zfs/metaslab.rs
- +0 −385 crates/zfs/nvpair.rs
- +0 −266 crates/zfs/nvstream.rs
- +0 −319 crates/zfs/spa.rs
- +0 −207 crates/zfs/space_map.rs
- +0 −371 crates/zfs/taskq.rs
- +0 −5 crates/zfs/txg.rs
- +0 −47 crates/zfs/uberblock.rs
- +0 −74 crates/zfs/util.rs
- +0 −506 crates/zfs/vdev.rs
- +0 −34 crates/zfs/vdev_file.rs
- +0 −1,011 crates/zfs/vdev_label.rs
- +0 −682 crates/zfs/vdev_queue.rs
- +0 −145 crates/zfs/xdr/mem_ops.rs
- +0 −5 crates/zfs/xdr/mod.rs
- +0 −219 crates/zfs/xdr/xdr.rs
- +0 −190 crates/zfs/zap.rs
- +0 −38 crates/zfs/zfs.rs
- +0 −8 crates/zfs/zil_header.rs
- +0 −950 crates/zfs/zio.rs
6  .gitmodules
15  Makefile
2  crates/binutils
@@ -1 +1 @@
-Subproject commit 3232642e98433e882148f296416023e1f22b9bda
+Subproject commit 5599724eab8b28705f6b2b66145fdcb7e4ce2d4d
2  crates/coreutils
@@ -1 +1 @@
-Subproject commit 41eef0ff8a18f0011f373e6b78fb199a25ef2926
+Subproject commit 3a666f3b7ddf682c342363d9583b887a360d3dab
1  crates/extra
@@ -0,0 +1 @@
+Subproject commit dd01a09283df73e8e62a6fa59ede41897459dcbd
2  crates/extrautils
@@ -1 +1 @@
-Subproject commit 90e803f249803d93e081b71c553177c4befd6f18
+Subproject commit b1ebde2e1a5e3cac977d076d4df04b7a76f06ff3
2  crates/games
@@ -1 +1 @@
-Subproject commit 98ffb8e0a2c471252e5a922f8dd6a335388d7a10
+Subproject commit eb52fcb69b59957bd3bfdf6a6ff37234788bf521
1  crates/zfs
@@ -0,0 +1 @@
+Subproject commit 066a57daef9f86c59018867d904e9fb15d3ddec7
126  crates/zfs/arcache.rs
@@ -1,126 +0,0 @@ | ||
-use std::collections::{BTreeMap, VecDeque}; | ||
- | ||
-use super::dvaddr::DVAddr; | ||
-use super::zio; | ||
- | ||
-/// MRU - Most Recently Used cache | ||
-struct Mru { | ||
- map: BTreeMap<DVAddr, Vec<u8>>, | ||
- queue: VecDeque<DVAddr>, // Oldest DVAddrs are at the end | ||
- size: usize, // Max mru cache size in blocks | ||
- used: usize, // Number of used blocks in mru cache | ||
-} | ||
- | ||
-impl Mru { | ||
- pub fn new() -> Self { | ||
- Mru { | ||
- map: BTreeMap::new(), | ||
- queue: VecDeque::new(), | ||
- size: 1000, | ||
- used: 0, | ||
- } | ||
- } | ||
- | ||
- pub fn cache_block(&mut self, dva: &DVAddr, block: Vec<u8>) -> Result<Vec<u8>, String> { | ||
- // If necessary, make room for the block in the cache | ||
- while self.used + (dva.asize() as usize) > self.size { | ||
- let last_dva = match self.queue.pop_back() { | ||
- Some(dva) => dva, | ||
- None => return Err("No more ARC MRU items to free".to_string()), | ||
- }; | ||
- self.map.remove(&last_dva); | ||
- self.used -= last_dva.asize() as usize; | ||
- } | ||
- | ||
- // Add the block to the cache | ||
- self.used += dva.asize() as usize; | ||
- self.map.insert(*dva, block); | ||
- self.queue.push_front(*dva); | ||
- Ok(self.map.get(dva).unwrap().clone()) | ||
- } | ||
-} | ||
- | ||
-/// MFU - Most Frequently Used cache | ||
-struct Mfu { | ||
- // TODO: Keep track of use counts. So mfu_map becomes (use_count: u64, Vec<u8>). Reset the use | ||
- // count every once in a while. For instance, every 1000 reads. This will probably end up being | ||
- // a knob for the user. | ||
- // TODO: Keep track of minimum frequency and corresponding DVA | ||
- map: BTreeMap<DVAddr, (u64, Vec<u8>)>, | ||
- size: usize, // Max mfu cache size in blocks | ||
- used: usize, // Number of used bytes in mfu cache | ||
-} | ||
- | ||
-impl Mfu { | ||
- pub fn new() -> Self { | ||
- Mfu { | ||
- map: BTreeMap::new(), | ||
- size: 1000, | ||
- used: 0, | ||
- } | ||
- } | ||
- | ||
- pub fn cache_block(&mut self, dva: &DVAddr, block: Vec<u8>) -> Result<Vec<u8>, String> { | ||
- { | ||
- let mut lowest_freq = ::std::u64::MAX; | ||
- let mut lowest_dva: Result<DVAddr, String> = Err("No valid DVA found.".to_string()); | ||
- | ||
- for (&dva_key, &(freq, _)) in self.map.iter() { | ||
- if freq < lowest_freq { | ||
- lowest_freq = freq; | ||
- lowest_dva = Ok(dva_key); | ||
- } | ||
- } | ||
- | ||
- self.map.remove(&try!(lowest_dva)); | ||
- } | ||
- | ||
- // Add the block to the cache | ||
- self.used += dva.asize() as usize; | ||
- self.map.insert(*dva, (2, block)); | ||
- Ok(self.map.get(dva).unwrap().1.clone()) | ||
- } | ||
-} | ||
- | ||
-// Our implementation of the Adaptive Replacement Cache (ARC) is set up to allocate | ||
-// its buffer on the heap rather than in a private pool thing. This makes it much | ||
-// simpler to implement, but defers the fragmentation problem to the heap allocator. | ||
-// We named the type `ArCache` to avoid confusion with Rust's `Arc` reference type. | ||
-pub struct ArCache { | ||
- mru: Mru, | ||
- mfu: Mfu, | ||
-} | ||
- | ||
-impl ArCache { | ||
- pub fn new() -> Self { | ||
- ArCache { | ||
- mru: Mru::new(), | ||
- mfu: Mfu::new(), | ||
- } | ||
- } | ||
- | ||
- pub fn read(&mut self, reader: &mut zio::Reader, dva: &DVAddr) -> Result<Vec<u8>, String> { | ||
- if let Some(block) = self.mru.map.remove(dva) { | ||
- self.mfu.map.insert(*dva, (0, block.clone())); | ||
- | ||
- // Block is cached | ||
- return Ok(block); | ||
- } | ||
- if let Some(block) = self.mfu.map.get_mut(dva) { | ||
- // Block is cached | ||
- if block.0 > 1000 { | ||
- block.0 = 0; | ||
- } else { | ||
- block.0 += 1; | ||
- } | ||
- | ||
- return Ok(block.1.clone()); | ||
- } | ||
- | ||
- // Block isn't cached, have to read it from disk | ||
- let block = reader.read(dva.sector() as usize, dva.asize() as usize); | ||
- | ||
- // Blocks start in MRU cache | ||
- self.mru.cache_block(dva, block) | ||
- } | ||
-} |
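The removed ArCache splits its entries between an MRU and an MFU map and promotes a block on its second hit. Below is a minimal, self-contained sketch of just that promotion policy, with u64 keys standing in for DVAddr and a closure standing in for zio::Reader; MiniArc and its unbounded maps are illustrative only, not the deleted module's API.

    use std::collections::BTreeMap;

    struct MiniArc {
        mru: BTreeMap<u64, Vec<u8>>,
        mfu: BTreeMap<u64, (u64, Vec<u8>)>,
    }

    impl MiniArc {
        fn new() -> Self {
            MiniArc { mru: BTreeMap::new(), mfu: BTreeMap::new() }
        }

        fn read<F: FnMut(u64) -> Vec<u8>>(&mut self, key: u64, mut read_disk: F) -> Vec<u8> {
            if let Some(block) = self.mru.remove(&key) {
                // Second hit on this key: promote the block from MRU into MFU.
                self.mfu.insert(key, (0, block.clone()));
                return block;
            }
            if let Some(entry) = self.mfu.get_mut(&key) {
                entry.0 += 1; // MFU hit: bump the use count
                return entry.1.clone();
            }
            // Miss: read from the backing store and cache the block in MRU.
            let block = read_disk(key);
            self.mru.insert(key, block.clone());
            block
        }
    }

    fn main() {
        let mut arc = MiniArc::new();
        arc.read(7, |k| vec![k as u8; 4]);             // miss, lands in MRU
        arc.read(7, |k| vec![k as u8; 4]);             // MRU hit, promoted to MFU
        let block = arc.read(7, |k| vec![k as u8; 4]); // MFU hit
        assert_eq!(block, vec![7, 7, 7, 7]);
    }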
333  crates/zfs/avl.rs
@@ -1,333 +0,0 @@ | ||
-use std::rc::Rc; | ||
- | ||
-pub struct Node<T> { | ||
- value: T, | ||
- left: Option<usize>, // ID for left node | ||
- right: Option<usize>, // ID for right node | ||
-} | ||
- | ||
-impl<T> Node<T> { | ||
- pub fn value(&self) -> &T { | ||
- &self.value | ||
- } | ||
- pub fn left<K>(&self, tree: &Tree<T, K>) -> Option<NodeId> { | ||
- self.left.map(|l| { | ||
- NodeId { | ||
- index: l, | ||
- time_stamp: tree.nodes[l].time_stamp, | ||
- } | ||
- }) | ||
- } | ||
- pub fn right<K>(&self, tree: &Tree<T, K>) -> Option<NodeId> { | ||
- self.right.map(|r| { | ||
- NodeId { | ||
- index: r, | ||
- time_stamp: tree.nodes[r].time_stamp, | ||
- } | ||
- }) | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
-#[derive(Copy, Clone)] | ||
-pub struct NodeId { | ||
- index: usize, | ||
- time_stamp: u64, | ||
-} | ||
- | ||
-impl NodeId { | ||
- pub fn get<'a, T, K>(&self, avl: &'a Tree<T, K>) -> &'a Node<T> { | ||
- let ref slot = avl.nodes[self.index]; | ||
- if slot.time_stamp == self.time_stamp { | ||
- slot.node.as_ref().unwrap() | ||
- } else { | ||
- panic!("NodeId had invalid time_stamp"); | ||
- } | ||
- } | ||
- | ||
- pub fn try_get<'a, T, K>(&self, avl: &'a Tree<T, K>) -> Option<&'a Node<T>> { | ||
- avl.nodes | ||
- .get(self.index) | ||
- .and_then(|slot| { | ||
- if slot.time_stamp == self.time_stamp { | ||
- slot.node.as_ref() | ||
- } else { | ||
- None | ||
- } | ||
- }) | ||
- } | ||
- | ||
- pub fn get_mut<'a, T, K>(&self, avl: &'a mut Tree<T, K>) -> &'a mut Node<T> { | ||
- let ref mut slot = avl.nodes[self.index]; | ||
- if slot.time_stamp == self.time_stamp { | ||
- slot.node.as_mut().unwrap() | ||
- } else { | ||
- panic!("NodeId had invalid time_stamp"); | ||
- } | ||
- } | ||
- | ||
- pub fn try_get_mut<'a, T, K>(&self, avl: &'a mut Tree<T, K>) -> Option<&'a mut Node<T>> { | ||
- avl.nodes | ||
- .get_mut(self.index) | ||
- .and_then(|slot| { | ||
- if slot.time_stamp == self.time_stamp { | ||
- slot.node.as_mut() | ||
- } else { | ||
- None | ||
- } | ||
- }) | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-pub struct Tree<T, K> { | ||
- root: Option<usize>, // Index of the root node | ||
- nodes: Vec<Slot<T>>, | ||
- free_list: Vec<usize>, | ||
- key: Rc<Fn(&T) -> K>, | ||
-} | ||
- | ||
-impl<T, K: PartialOrd> Tree<T, K> { | ||
- pub fn new(key: Rc<Fn(&T) -> K>) -> Self { | ||
- Tree { | ||
- root: None, | ||
- nodes: Vec::new(), | ||
- free_list: Vec::new(), | ||
- key: key, | ||
- } | ||
- } | ||
- | ||
- // Inserts a value into the tree, keeping it balanced. Lesser values will be stored on | ||
- // the left, while greater values will be stored on the right. No duplicates are allowed. | ||
- pub fn insert(&mut self, value: T) { | ||
- let root = self.root; | ||
- self.root = Some(self._insert(value, root)); | ||
- } | ||
- | ||
- pub fn in_order<F: Fn(&Node<T>)>(&self, f: F) { | ||
- if let Some(root) = self.root { | ||
- self._in_order(&f, root); | ||
- } | ||
- } | ||
- | ||
- /// Good ol' binary search. Returns immutable reference | ||
- pub fn find(&self, key: K) -> Option<&T> { | ||
- let root = self.root; | ||
- self._find(key, root) | ||
- } | ||
- | ||
- /// Good ol' binary search. Returns a mutable reference | ||
- pub fn find_mut(&mut self, key: K) -> Option<&mut T> { | ||
- let root = self.root; | ||
- self._find_mut(key, root) | ||
- } | ||
- | ||
- // Implementation of insert | ||
- fn _insert(&mut self, value: T, node: Option<usize>) -> usize { | ||
- let node = match node { | ||
- Some(node) => { | ||
- // Node exists, check which way to branch. | ||
- if (self.key)(&value) == (self.key)(&self.node(node).value) { | ||
- return node; | ||
- } else if (self.key)(&value) < (self.key)(&self.node(node).value) { | ||
- let l = self.node(node).left; | ||
- self.node_mut(node).left = Some(self._insert(value, l)); | ||
- } else if (self.key)(&value) > (self.key)(&self.node(node).value) { | ||
- let r = self.node(node).right; | ||
- self.node_mut(node).right = Some(self._insert(value, r)); | ||
- } | ||
- | ||
- node | ||
- } | ||
- None => { | ||
- // The node doesn't exist, create it here. | ||
- self.allocate_node(value) | ||
- } | ||
- }; | ||
- | ||
- self.rebalance(node) | ||
- } | ||
- | ||
- pub fn _in_order<F: Fn(&Node<T>)>(&self, f: &F, node: usize) { | ||
- if let Some(l) = self.node(node).left { | ||
- self._in_order(f, l); | ||
- } | ||
- f(self.node(node)); | ||
- if let Some(r) = self.node(node).right { | ||
- self._in_order(f, r); | ||
- } | ||
- } | ||
- | ||
- pub fn _find(&self, key: K, node: Option<usize>) -> Option<&T> { | ||
- node.and_then(|n| { | ||
- if (self.key)(&self.node(n).value) < key { | ||
- let left = self.node(n).left; | ||
- self._find(key, left) | ||
- } else if (self.key)(&self.node(n).value) > key { | ||
- let right = self.node(n).right; | ||
- self._find(key, right) | ||
- } else { | ||
- // Found it! | ||
- Some(&self.node(n).value) | ||
- } | ||
- }) | ||
- } | ||
- | ||
- pub fn _find_mut(&mut self, key: K, node: Option<usize>) -> Option<&mut T> { | ||
- match node { | ||
- Some(n) => { | ||
- if (self.key)(&self.node(n).value) < key { | ||
- let left = self.node(n).left; | ||
- self._find_mut(key, left) | ||
- } else if (self.key)(&self.node(n).value) > key { | ||
- let right = self.node(n).right; | ||
- self._find_mut(key, right) | ||
- } else { | ||
- // Found it! | ||
- Some(&mut self.node_mut(n).value) | ||
- } | ||
- } | ||
- None => None, | ||
- } | ||
- } | ||
- | ||
- // Performs a left rotation on a tree/subtree. | ||
- // Returns the replace the specified node with | ||
- fn rotate_left(&mut self, node: usize) -> usize { | ||
- // Keep track of the original node positions | ||
- // For a rotate left, the right child node must exist | ||
- let r = self.node(node).right.unwrap(); | ||
- let rl = self.node(r).left; | ||
- | ||
- let ret = r; | ||
- self.node_mut(node).right = rl; | ||
- self.node_mut(ret).left = Some(node); | ||
- | ||
- ret | ||
- } | ||
- | ||
- // Performs a right rotation on a tree/subtree. | ||
- // Returns the replace the specified node with | ||
- fn rotate_right(&mut self, node: usize) -> usize { | ||
- // Keep track of the original node positions | ||
- // For a rotate right, the left child node must exist | ||
- let l = self.node(node).left.unwrap(); | ||
- let lr = self.node(l).right; | ||
- | ||
- let ret = l; | ||
- self.node_mut(node).left = lr; | ||
- self.node_mut(ret).right = Some(node); | ||
- | ||
- ret | ||
- } | ||
- | ||
- // Performs a left-right double rotation on a tree/subtree. | ||
- fn rotate_leftright(&mut self, node: usize) -> usize { | ||
- let l = self.node(node).left.unwrap(); | ||
- let new_l = self.rotate_left(l); // Left node needs to exist | ||
- self.node_mut(node).left = Some(new_l); | ||
- self.rotate_right(node) | ||
- } | ||
- | ||
- // Performs a right-left double rotation on a tree/subtree. | ||
- fn rotate_rightleft(&mut self, node: usize) -> usize { | ||
- let r = self.node(node).right.unwrap(); | ||
- let new_r = self.rotate_right(r); // Right node needs to exist | ||
- self.node_mut(node).right = Some(new_r); | ||
- self.rotate_left(node) | ||
- } | ||
- | ||
- // Rebalances the provided node and returns the node to replace it with if rotations | ||
- // occur | ||
- fn rebalance(&mut self, node: usize) -> usize { | ||
- let balance = self.height(self.node(node).left) - self.height(self.node(node).right); | ||
- if balance == 2 { | ||
- // left | ||
- let lbalance = self.height(self.node(self.node(node).left.unwrap()).left) - | ||
- self.height(self.node(self.node(node).left.unwrap()).right); | ||
- if lbalance == 0 || lbalance == 1 { | ||
- // left left - need to rotate right | ||
- return self.rotate_right(node); | ||
- } else if lbalance == -1 { | ||
- // left right | ||
- return self.rotate_leftright(node); // function name is just a coincidence | ||
- } | ||
- } else if balance == -2 { | ||
- // right | ||
- let rbalance = self.height(self.node(self.node(node).right.unwrap()).left) - | ||
- self.height(self.node(self.node(node).right.unwrap()).right); | ||
- if rbalance == 1 { | ||
- // right left | ||
- return self.rotate_rightleft(node); // function name is just a coincidence | ||
- } else if rbalance == 0 || rbalance == -1 { | ||
- // right right - need to rotate left | ||
- return self.rotate_left(node); | ||
- } | ||
- } | ||
- | ||
- node | ||
- } | ||
- | ||
- // height gets the height of a tree or subtree | ||
- fn height(&self, node: Option<usize>) -> i64 { | ||
- match node { | ||
- Some(node) => { | ||
- let left_height = self.height(self.node(node).left); | ||
- let right_height = self.height(self.node(node).right); | ||
- | ||
- if left_height > right_height { | ||
- left_height + 1 | ||
- } else { | ||
- right_height + 1 | ||
- } | ||
- } | ||
- None => -1, | ||
- } | ||
- } | ||
- | ||
- fn allocate_node(&mut self, value: T) -> usize { | ||
- match self.free_list.pop() { | ||
- Some(index) => { | ||
- self.nodes[index].time_stamp += 1; | ||
- index | ||
- } | ||
- None => { | ||
- // No free slots, create a new one | ||
- let index = self.nodes.len(); | ||
- self.nodes.push(Slot { | ||
- time_stamp: 0, | ||
- node: Some(Node { | ||
- value: value, | ||
- left: None, | ||
- right: None, | ||
- }), | ||
- }); | ||
- index | ||
- } | ||
- } | ||
- } | ||
- | ||
- fn free_node(&mut self, index: usize) -> Node<T> { | ||
- self.free_list.push(index); | ||
- | ||
- // NOTE: We unwrap here, because we trust that `id` points to a valid node, because | ||
- // only we can create and free Nodes and their NodeIds | ||
- self.nodes[index].node.take().unwrap() | ||
- } | ||
- | ||
- fn node(&self, index: usize) -> &Node<T> { | ||
- self.nodes[index].node.as_ref().unwrap() | ||
- } | ||
- | ||
- fn node_mut(&mut self, index: usize) -> &mut Node<T> { | ||
- self.nodes[index].node.as_mut().unwrap() | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-struct Slot<T> { | ||
- time_stamp: u64, | ||
- node: Option<Node<T>>, | ||
-} |
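The deleted tree keeps its nodes in a Vec of Slots and hands out NodeIds that carry a time_stamp, so a handle to a slot that has been freed and reused can be detected rather than silently resolving to a different node. A minimal, self-contained sketch of that generational-index scheme follows; Arena, Handle and the string values are illustrative names, not the removed API.

    struct Slot<T> {
        time_stamp: u64,
        value: Option<T>,
    }

    #[derive(Copy, Clone)]
    struct Handle {
        index: usize,
        time_stamp: u64,
    }

    struct Arena<T> {
        slots: Vec<Slot<T>>,
        free_list: Vec<usize>,
    }

    impl<T> Arena<T> {
        fn new() -> Self {
            Arena { slots: Vec::new(), free_list: Vec::new() }
        }

        // Reuse a freed slot if one exists, bumping its time_stamp so that old
        // handles to it stop resolving; otherwise append a fresh slot.
        fn insert(&mut self, value: T) -> Handle {
            match self.free_list.pop() {
                Some(index) => {
                    let slot = &mut self.slots[index];
                    slot.time_stamp += 1;
                    slot.value = Some(value);
                    Handle { index, time_stamp: slot.time_stamp }
                }
                None => {
                    self.slots.push(Slot { time_stamp: 0, value: Some(value) });
                    Handle { index: self.slots.len() - 1, time_stamp: 0 }
                }
            }
        }

        // A handle only resolves while its time_stamp matches the slot's.
        fn get(&self, handle: Handle) -> Option<&T> {
            self.slots.get(handle.index).and_then(|slot| {
                if slot.time_stamp == handle.time_stamp {
                    slot.value.as_ref()
                } else {
                    None
                }
            })
        }

        fn remove(&mut self, handle: Handle) -> Option<T> {
            let slot = self.slots.get_mut(handle.index)?;
            if slot.time_stamp != handle.time_stamp {
                return None;
            }
            self.free_list.push(handle.index);
            slot.value.take()
        }
    }

    fn main() {
        let mut arena = Arena::new();
        let a = arena.insert("first");
        arena.remove(a);
        let b = arena.insert("second"); // reuses slot 0 and bumps its stamp
        assert!(arena.get(a).is_none()); // stale handle is rejected
        assert_eq!(arena.get(b), Some(&"second"));
    }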
56  crates/zfs/block_ptr.rs
@@ -1,56 +0,0 @@ | ||
-use super::from_bytes::FromBytes; | ||
-use super::dvaddr::DVAddr; | ||
- | ||
-#[derive(Copy, Clone, Debug)] | ||
-#[repr(packed)] | ||
-pub struct BlockPtr { | ||
- pub dvas: [DVAddr; 3], | ||
- pub flags_size: u64, | ||
- pub padding: [u64; 3], | ||
- pub birth_txg: u64, | ||
- pub fill_count: u64, | ||
- pub checksum: [u64; 4], | ||
-} | ||
- | ||
-impl BlockPtr { | ||
- pub fn level(&self) -> u64 { | ||
- (self.flags_size >> 56) & 0x7F | ||
- } | ||
- | ||
- pub fn object_type(&self) -> u64 { | ||
- (self.flags_size >> 48) & 0xFF | ||
- } | ||
- | ||
- pub fn checksum(&self) -> u64 { | ||
- (self.flags_size >> 40) & 0xFF | ||
- } | ||
- | ||
- pub fn compression(&self) -> u64 { | ||
- (self.flags_size >> 32) & 0xFF | ||
- } | ||
- | ||
- pub fn lsize(&self) -> u64 { | ||
- (self.flags_size & 0xFFFF) + 1 | ||
- } | ||
- | ||
- pub fn psize(&self) -> u64 { | ||
- ((self.flags_size >> 16) & 0xFFFF) + 1 | ||
- } | ||
-} | ||
- | ||
-impl FromBytes for BlockPtr {} | ||
- | ||
-#[derive(Copy, Clone, Debug)] | ||
-#[repr(packed)] | ||
-pub struct Gang { | ||
- pub bps: [BlockPtr; 3], | ||
- pub padding: [u64; 14], | ||
- pub magic: u64, | ||
- pub checksum: u64, | ||
-} | ||
- | ||
-impl Gang { | ||
- pub fn magic() -> u64 { | ||
- return 0x117a0cb17ada1002; | ||
- } | ||
-} |
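The accessors above all unpack a single 64-bit flags_size word. The sketch below builds such a word from illustrative field values and checks that the same shifts and masks recover them; note that lsize and psize are stored minus one, in 512-byte sectors, and that compression value 3 is the lzjb case handled elsewhere in this crate. The constants are made up to exercise the bit layout, not taken from a real pool.

    fn main() {
        // level 1, object type 0x13, checksum 0x07, compression 0x03 (lzjb),
        // psize 8 sectors, lsize 16 sectors
        let flags_size: u64 = (1u64 << 56)
            | (0x13u64 << 48)
            | (0x07u64 << 40)
            | (0x03u64 << 32)
            | ((8 - 1) << 16)
            | (16 - 1);

        assert_eq!((flags_size >> 56) & 0x7F, 1);          // level()
        assert_eq!((flags_size >> 48) & 0xFF, 0x13);       // object_type()
        assert_eq!((flags_size >> 40) & 0xFF, 0x07);       // checksum()
        assert_eq!((flags_size >> 32) & 0xFF, 0x03);       // compression()
        assert_eq!(((flags_size >> 16) & 0xFFFF) + 1, 8);  // psize(), in sectors
        assert_eq!((flags_size & 0xFFFF) + 1, 16);         // lsize(), in sectors
    }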
1,935  crates/zfs/dmu_objset.rs
0 additions, 1,935 deletions not shown because the diff is too large. Please use a local Git client to view these changes.
85  crates/zfs/dnode.rs
@@ -1,85 +0,0 @@ | ||
-use std::fmt; | ||
-use std::mem; | ||
- | ||
-use super::block_ptr::BlockPtr; | ||
-use super::from_bytes::FromBytes; | ||
-use super::zil_header::ZilHeader; | ||
- | ||
-#[repr(u8)] | ||
-#[derive(Debug, Eq, PartialEq)] | ||
-pub enum ObjectType { | ||
- None, | ||
- ObjectDirectory, | ||
- ObjectArray, | ||
- PackedNvList, | ||
- NvListSize, | ||
- BlockPtrList, | ||
- BlockPtrListHdr, | ||
- SpaceMapHeader, | ||
- SpaceMap, | ||
- IntentLog, | ||
- DNode, | ||
- ObjSet, | ||
- DataSet, | ||
- DataSetChildMap, | ||
- ObjSetSnapMap, | ||
- DslProps, | ||
- DslObjSet, | ||
- ZNode, | ||
- Acl, | ||
- PlainFileContents, | ||
- DirectoryContents, | ||
- MasterNode, | ||
- DeleteQueue, | ||
- ZVol, | ||
- ZVolProp, | ||
-} | ||
- | ||
-#[repr(packed)] | ||
-pub struct DNodePhys { | ||
- pub object_type: ObjectType, | ||
- pub indblkshift: u8, // ln2(indirect block size) | ||
- pub nlevels: u8, // 1=blkptr->data blocks | ||
- pub nblkptr: u8, // length of blkptr | ||
- pub bonus_type: u8, // type of data in bonus buffer | ||
- pub checksum: u8, // ZIO_CHECKSUM type | ||
- pub compress: u8, // ZIO_COMPRESS type | ||
- pub flags: u8, // DNODE_FLAG_* | ||
- pub data_blk_sz_sec: u16, // data block size in 512b sectors | ||
- pub bonus_len: u16, // length of bonus | ||
- pub pad2: [u8; 4], | ||
- | ||
- // accounting is protected by dirty_mtx | ||
- pub maxblkid: u64, // largest allocated block ID | ||
- pub used: u64, // bytes (or sectors) of disk space | ||
- | ||
- pub pad3: [u64; 4], | ||
- | ||
- blkptr_bonus: [u8; 448], | ||
-} | ||
- | ||
-impl DNodePhys { | ||
- pub fn get_blockptr<'a>(&self, i: usize) -> &'a BlockPtr { | ||
- unsafe { mem::transmute(&self.blkptr_bonus[i * 128]) } | ||
- } | ||
- | ||
- pub fn get_bonus(&self) -> &[u8] { | ||
- &self.blkptr_bonus[(self.nblkptr as usize) * 128..] | ||
- } | ||
-} | ||
- | ||
-impl FromBytes for DNodePhys {} | ||
- | ||
-impl fmt::Debug for DNodePhys { | ||
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
- try!(write!(f, | ||
- "DNodePhys {{ object_type: {:?}, nlevels: {:X}, nblkptr: {:X}, bonus_type: \ | ||
- {:X}, bonus_len: {:X}}}\n", | ||
- self.object_type, | ||
- self.nlevels, | ||
- self.nblkptr, | ||
- self.bonus_type, | ||
- self.bonus_len)); | ||
- Ok(()) | ||
- } | ||
-} |
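get_blockptr and get_bonus both index into the 448-byte blkptr_bonus tail, assuming 128 bytes per on-disk block pointer followed by the bonus buffer. A small sketch of that layout arithmetic; the all-zero buffer and the nblkptr value of 3 are illustrative.

    fn main() {
        let blkptr_bonus = [0u8; 448];
        let nblkptr = 3usize;

        // get_blockptr(i) reads 128 bytes starting at i * 128
        for i in 0..nblkptr {
            let bp = &blkptr_bonus[i * 128..(i + 1) * 128];
            assert_eq!(bp.len(), 128);
        }

        // get_bonus() is everything after the last block pointer
        let bonus = &blkptr_bonus[nblkptr * 128..];
        assert_eq!(bonus.len(), 448 - 3 * 128); // 64 bytes of bonus space left
    }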
92  crates/zfs/dsl_dataset.rs
@@ -1,92 +0,0 @@ | ||
-use super::block_ptr::BlockPtr; | ||
-use super::from_bytes::FromBytes; | ||
- | ||
-#[repr(packed)] | ||
-pub struct DslDatasetPhys { | ||
- pub dir_obj: u64, // DMU_OT_DSL_DIR | ||
- pub prev_snap_obj: u64, // DMU_OT_DSL_DATASET | ||
- pub prev_snap_txg: u64, | ||
- pub next_snap_obj: u64, // DMU_OT_DSL_DATASET | ||
- pub snapnames_zapobj: u64, // DMU_OT_DSL_DS_SNAP_MAP 0 for snaps | ||
- pub num_children: u64, // clone/snap children, ==0 for head | ||
- pub creation_time: u64, // seconds since 1970 | ||
- pub creation_txg: u64, | ||
- pub deadlist_obj: u64, // DMU_OT_DEADLIST | ||
- // ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes | ||
- // include all blocks referenced by this dataset, including those | ||
- // shared with any other datasets. | ||
- // | ||
- pub referenced_bytes: u64, | ||
- pub compressed_bytes: u64, | ||
- pub uncompressed_bytes: u64, | ||
- pub unique_bytes: u64, // only relevant to snapshots | ||
- // The ds_fsid_guid is a 56-bit ID that can change to avoid | ||
- // collisions. The ds_guid is a 64-bit ID that will never | ||
- // change, so there is a small probability that it will collide. | ||
- // | ||
- pub fsid_guid: u64, | ||
- pub guid: u64, | ||
- pub flags: u64, // DS_FLAG_* | ||
- pub bp: BlockPtr, | ||
- pub next_clones_obj: u64, // DMU_OT_DSL_CLONES | ||
- pub props_obj: u64, // DMU_OT_DSL_PROPS for snaps | ||
- pub userrefs_obj: u64, // DMU_OT_USERREFS | ||
- pad: [u64; 5], // pad out to 320 bytes for good measure | ||
-} | ||
- | ||
-impl FromBytes for DslDatasetPhys {} | ||
- | ||
-//------------------------------------------------------------------------------------------------// | ||
- | ||
-// struct DslDataset { | ||
-// dmu_buf_user_t ds_dbu, | ||
-// | ||
-// Immutable: | ||
-// dsl_dir *ds_dir, | ||
-// dmu_buf_t *ds_dbuf, | ||
-// object: u64, | ||
-// fsid_guid: u64, | ||
-// is_snapshot: bool, | ||
-// | ||
-// only used in syncing context, only valid for non-snapshots: | ||
-// dsl_dataset *ds_prev, | ||
-// bookmarks: u64, // DMU_OTN_ZAP_METADATA | ||
-// large_blocks: bool, | ||
-// need_large_blocks: bool, | ||
-// | ||
-// has internal locking: | ||
-// dsl_deadlist_t ds_deadlist, | ||
-// bplist_t ds_pending_deadlist, | ||
-// | ||
-// protected by lock on pool's dp_dirty_datasets list | ||
-// txg_node_t ds_dirty_link, | ||
-// list_node_t ds_synced_link, | ||
-// | ||
-// ds_phys->ds_<accounting> is also protected by ds_lock. | ||
-// Protected by ds_lock: | ||
-// kmutex_t ds_lock, | ||
-// objset_t *ds_objset, | ||
-// ds_userrefs: u64, | ||
-// void *ds_owner, | ||
-// | ||
-// Long holds prevent the ds from being destroyed, they allow the | ||
-// ds to remain held even after dropping the dp_config_rwlock. | ||
-// Owning counts as a long hold. See the comments above | ||
-// dsl_pool_hold() for details. | ||
-// refcount_t ds_longholds, | ||
-// | ||
-// no locking, only for making guesses | ||
-// ds_trysnap_txg: u64, | ||
-// | ||
-// for objset_open() | ||
-// kmutex_t ds_opening_lock, | ||
-// | ||
-// ds_reserved: u64, // cached refreservation | ||
-// ds_quota: u64, // cached refquota | ||
-// | ||
-// kmutex_t ds_sendstream_lock, | ||
-// list_t ds_sendstreams, | ||
-// | ||
-// Protected by ds_lock, keep at end of struct for better locality | ||
-// char ds_snapname[MAXNAMELEN], | ||
-// } |
37  crates/zfs/dsl_dir.rs
@@ -1,37 +0,0 @@ | ||
-use super::from_bytes::FromBytes; | ||
- | ||
-const DD_USED_NUM: usize = 5; // The number of variants in DslDirUsed | ||
- | ||
-pub enum DslDirUsed { | ||
- Head = 0, | ||
- Snap, | ||
- Child, | ||
- ChildReserve, | ||
- RefReserve, | ||
-} | ||
- | ||
-#[repr(packed)] | ||
-pub struct DslDirPhys { | ||
- pub creation_time: u64, // not actually used | ||
- pub head_dataset_obj: u64, | ||
- pub parent_obj: u64, | ||
- pub origin_obj: u64, | ||
- pub child_dir_zapobj: u64, | ||
- // how much space our children are accounting for, for leaf | ||
- // datasets, == physical space used by fs + snaps | ||
- pub used_bytes: u64, | ||
- pub compressed_bytes: u64, | ||
- pub uncompressed_bytes: u64, | ||
- // Administrative quota setting | ||
- pub quota: u64, | ||
- // Administrative reservation setting | ||
- pub reserved: u64, | ||
- pub props_zapobj: u64, | ||
- pub deleg_zapobj: u64, // dataset delegation permissions | ||
- pub flags: u64, | ||
- pub used_breakdown: [u64; DD_USED_NUM], | ||
- pub clones: u64, // dsl_dir objects | ||
- pub pad: [u64; 13], // pad out to 256 bytes for good measure | ||
-} | ||
- | ||
-impl FromBytes for DslDirPhys {} |
17  crates/zfs/dsl_pool.rs
@@ -1,17 +0,0 @@ | ||
-use super::spa; | ||
-use super::zfs; | ||
- | ||
-pub struct DslPool { | ||
- // Immutable | ||
- root_dir_obj: u64, | ||
-} | ||
- | ||
-impl DslPool { | ||
- pub fn init(spa: &mut spa::Spa, txg: u64) -> zfs::Result<Self> { | ||
- Self::open_impl(spa, txg) | ||
- } | ||
- | ||
- fn open_impl(spa: &mut spa::Spa, txg: u64) -> zfs::Result<Self> { | ||
- Ok(DslPool { root_dir_obj: 0 }) | ||
- } | ||
-} |
42  crates/zfs/dvaddr.rs
@@ -1,42 +0,0 @@ | ||
-use std::fmt; | ||
- | ||
-#[derive(Copy, Clone, Eq, Hash, Ord, PartialEq, PartialOrd)] | ||
-#[repr(packed)] | ||
-pub struct DVAddr { | ||
- pub vdev: u64, | ||
- pub offset: u64, | ||
-} | ||
- | ||
-impl DVAddr { | ||
- /// Sector address is the offset plus two vdev labels and one boot block (4 MB, or 8192 sectors) | ||
- pub fn sector(&self) -> u64 { | ||
- self.offset() + 0x2000 | ||
- } | ||
- | ||
- pub fn gang(&self) -> bool { | ||
- if self.offset & 0x8000000000000000 == 1 { | ||
- true | ||
- } else { | ||
- false | ||
- } | ||
- } | ||
- | ||
- pub fn offset(&self) -> u64 { | ||
- self.offset & 0x7FFFFFFFFFFFFFFF | ||
- } | ||
- | ||
- pub fn asize(&self) -> u64 { | ||
- (self.vdev & 0xFFFFFF) + 1 | ||
- } | ||
-} | ||
- | ||
-impl fmt::Debug for DVAddr { | ||
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
- try!(write!(f, | ||
- "DVAddr {{ offset: {:X}, gang: {}, asize: {:X} }}\n", | ||
- self.offset(), | ||
- self.gang(), | ||
- self.asize())); | ||
- Ok(()) | ||
- } | ||
-} |
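A short worked example of decoding the two DVAddr words with the same masks as above; the values are made up. One caveat visible in the removed code: gang() compares the masked top bit against 1, which can never match once the bit is set, so the sketch compares against zero instead.

    fn main() {
        let vdev: u64 = 0x0000_0000_0000_0007;   // low 24 bits hold asize - 1
        let offset: u64 = 0x8000_0000_0000_1000; // top bit is the gang flag

        let asize = (vdev & 0xFF_FFFF) + 1;
        let gang = offset & 0x8000_0000_0000_0000 != 0;
        let raw_offset = offset & 0x7FFF_FFFF_FFFF_FFFF;
        let sector = raw_offset + 0x2000; // skip two vdev labels + boot block

        assert_eq!(asize, 8);
        assert!(gang);
        assert_eq!(sector, 0x3000);
    }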
16  crates/zfs/from_bytes.rs
@@ -1,16 +0,0 @@ | ||
-use std::{mem, ptr}; | ||
- | ||
-pub trait FromBytes: Sized { | ||
- fn from_bytes(data: &[u8]) -> Result<Self, String> { | ||
- if data.len() >= mem::size_of::<Self>() { | ||
- let s = unsafe { ptr::read(data.as_ptr() as *const Self) }; | ||
- Ok(s) | ||
- } else { | ||
- Err(format!("Error: bytes length of {} not long enough for the byte size of {}", | ||
- data.len(), | ||
- mem::size_of::<Self>())) | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl FromBytes for u64 {} |
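FromBytes casts a byte slice to a struct pointer and reads it out. A self-contained sketch of the same idea follows; it uses ptr::read_unaligned because a byte slice carries no alignment guarantee (the removed version used ptr::read, which assumes an aligned pointer), and the Pair struct is purely illustrative.

    use std::{mem, ptr};

    #[repr(C, packed)]
    #[derive(Copy, Clone)]
    struct Pair {
        a: u32,
        b: u32,
    }

    fn from_bytes<T: Copy>(data: &[u8]) -> Result<T, String> {
        if data.len() >= mem::size_of::<T>() {
            Ok(unsafe { ptr::read_unaligned(data.as_ptr() as *const T) })
        } else {
            Err(format!("need {} bytes, got {}", mem::size_of::<T>(), data.len()))
        }
    }

    fn main() {
        let bytes = [1u8, 0, 0, 0, 2, 0, 0, 0];
        let pair: Pair = from_bytes(&bytes).unwrap();
        let (a, b) = (pair.a, pair.b); // copy out of the packed struct before use
        assert_eq!((a, b), (1, 2)); // little-endian host assumed
        assert!(from_bytes::<Pair>(&bytes[..4]).is_err());
    }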
147  crates/zfs/lzjb.rs
@@ -1,147 +0,0 @@ | ||
-const NBBY: usize = 8; // Number of bits per byte | ||
-const MATCH_BITS: usize = 6; | ||
-const MATCH_MIN: usize = 3; | ||
-const MATCH_MAX: usize = ((1 << MATCH_BITS) + (MATCH_MIN - 1)); | ||
-const OFFSET_MASK: usize = ((1 << (16 - MATCH_BITS)) - 1); | ||
-const LEMPEL_SIZE: usize = 1024; | ||
- | ||
-/// LZJB compress the bytes in `src` into `dst` | ||
-pub fn compress(src: &[u8], dst: &mut [u8]) -> usize { | ||
- let mut src_i = 0; // Current index in src | ||
- let mut dst_i = 0; // Current index in dst | ||
- | ||
- // We place 1 extra byte preceding every 8 bytes. Each bit in this byte is | ||
- // a flag that corresponds to one of the 8 bytes that delimit it. If the | ||
- // flag is set, the byte is a copy item. If the flag is 0, it is a literal | ||
- // item. We'll call this the copy flag. | ||
- | ||
- // Stores the index of the current copy flag in dst | ||
- let mut copymap = 0; | ||
- | ||
- // The current bit in the byte pointed at by `copymap` | ||
- let mut copymask: usize = 1 << (NBBY - 1); | ||
- | ||
- // This is our cache | ||
- let mut lempel = [0usize; LEMPEL_SIZE]; | ||
- | ||
- while src_i < src.len() { | ||
- copymask <<= 1; | ||
- if copymask == (1 << NBBY) { | ||
- // We've reached the end of our 8-byte cycle | ||
- if dst_i >= dst.len() - 1 - 2 * NBBY { | ||
- // If we've reached the last two bytes, we're done | ||
- return src.len(); | ||
- } | ||
- // Not done yet, reset the cycle | ||
- copymask = 1; | ||
- copymap = dst_i; // Point to our new copy flag byte | ||
- dst[dst_i] = 0; // Place the new (initially clear) copy flag byte | ||
- dst_i += 1; | ||
- } | ||
- | ||
- if src_i > src.len() - MATCH_MAX { | ||
- // Nearing the end of the data, don't bother searching for matches, | ||
- // just copy. | ||
- dst[dst_i] = src[src_i]; | ||
- src_i += 1; | ||
- dst_i += 1; | ||
- continue; | ||
- } | ||
- | ||
- // Compute hash of current 3 byte slice. It will be the index to our | ||
- // cache | ||
- let mut hash = ((src[src_i] as usize) << 16) + ((src[src_i + 1] as usize) << 8) + | ||
- (src[src_i + 2] as usize); | ||
- hash += hash >> 9; | ||
- hash += hash >> 5; | ||
- let hp = (hash as usize) & (LEMPEL_SIZE - 1); | ||
- | ||
- // Look up the current 3 byte slice in the cache. We'll verify that it's | ||
- // a valid entry later. | ||
- let offset = (src_i - lempel[hp]) & OFFSET_MASK; | ||
- let cpy = src_i - offset; | ||
- | ||
- // Set the current 3 byte slice as the most recent sighting of it in the | ||
- // cache | ||
- lempel[hp] = src_i; | ||
- | ||
- // Check that the cached item is valid | ||
- if src_i >= offset && cpy != src_i && src[src_i] == src[cpy] && | ||
- src[src_i + 1] == src[cpy + 1] && src[src_i + 2] == src[cpy + 2] { | ||
- // This cache item is valid, write a copy item | ||
- dst[copymap] |= copymask as u8; // Set the | ||
- | ||
- // Find the full length of this match. Since it was in the hash, | ||
- // we know the match length is at least 3. | ||
- let mut mlen = MATCH_MIN; | ||
- while mlen < MATCH_MAX { | ||
- if src[src_i + mlen] != src[cpy + mlen] { | ||
- break; | ||
- } | ||
- mlen += 1; | ||
- } | ||
- | ||
- // Place the match length portion of the copy item | ||
- dst[dst_i] = (((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) | (offset >> NBBY)) as u8; | ||
- dst_i += 1; | ||
- | ||
- // Place the offset portion of the copy item | ||
- dst[dst_i] = offset as u8; | ||
- dst_i += 1; | ||
- | ||
- // Now we get to skip the repeated sequence! | ||
- src_i += mlen; | ||
- } else { | ||
- // Not a real cache entry, don't make a copy item | ||
- dst[dst_i] = src[src_i]; | ||
- dst_i += 1; | ||
- src_i += 1; | ||
- } | ||
- } | ||
- | ||
- return dst_i; | ||
-} | ||
- | ||
-pub fn decompress(src: &[u8], dst: &mut [u8]) -> bool { | ||
- let mut src_i = 0; | ||
- let mut dst_i = 0; | ||
- let mut copymap: u8 = 0; | ||
- let mut copymask: usize = 1 << (NBBY - 1); | ||
- | ||
- while dst_i < dst.len() { | ||
- copymask <<= 1; | ||
- if copymask == (1 << NBBY) { | ||
- // Finished another 8-byte loop, repeat | ||
- copymask = 1; // Reset the copy mask | ||
- copymap = src[src_i]; // Current byte is the new copymap | ||
- src_i += 1; | ||
- } | ||
- if (copymap & (copymask as u8)) != 0 { | ||
- // Found a copy item | ||
- let mlen = ((src[src_i] as usize) >> (NBBY - MATCH_BITS)) + MATCH_MIN; | ||
- let offset = (((src[src_i] as usize) << NBBY) | (src[src_i + 1] as usize)) & | ||
- OFFSET_MASK; | ||
- src_i += 2; | ||
- if dst_i < offset { | ||
- // Copy item points to invalid index, error | ||
- return false; | ||
- } | ||
- let mut cpy = dst_i - offset; | ||
- for _ in 0..mlen { | ||
- if dst_i >= dst.len() { | ||
- // Reached the end of the destination buffer, can't copy anymore | ||
- break; | ||
- } | ||
- dst[dst_i] = dst[cpy]; | ||
- dst_i += 1; | ||
- cpy += 1; | ||
- } | ||
- } else { | ||
- // It's a literal item, copy it directly | ||
- dst[dst_i] = src[src_i]; | ||
- dst_i += 1; | ||
- src_i += 1; | ||
- } | ||
- } | ||
- return true; | ||
-} |
621  crates/zfs/main.rs
@@ -1,621 +0,0 @@ | ||
-// To use this, please install zfs-fuse | ||
-use std::{mem, str}; | ||
-use std::fs::File; | ||
-use std::io::{Read, Write, stdin, stdout}; | ||
-use std::rc::Rc; | ||
- | ||
-use self::arcache::ArCache; | ||
-use self::dnode::{DNodePhys, ObjectType}; | ||
-use self::dmu_objset::ObjectSetPhys; | ||
-use self::block_ptr::BlockPtr; | ||
-use self::dsl_dataset::DslDatasetPhys; | ||
-use self::dsl_dir::DslDirPhys; | ||
-use self::from_bytes::FromBytes; | ||
-use self::nvpair::NvValue; | ||
-use self::space_map::SpaceMapPhys; | ||
-use self::uberblock::Uberblock; | ||
-use self::vdev::VdevLabel; | ||
- | ||
-macro_rules! readln { | ||
- () => ({ | ||
- let mut buffer = String::new(); | ||
- match stdin().read_line(&mut buffer) { | ||
- Ok(_) => Some(buffer), | ||
- Err(_) => None | ||
- } | ||
- }); | ||
-} | ||
- | ||
-pub mod arcache; | ||
-pub mod avl; | ||
-pub mod block_ptr; | ||
-pub mod dmu_objset; | ||
-pub mod dnode; | ||
-pub mod dsl_dataset; | ||
-pub mod dsl_dir; | ||
-pub mod dsl_pool; | ||
-pub mod dvaddr; | ||
-pub mod from_bytes; | ||
-pub mod lzjb; | ||
-pub mod metaslab; | ||
-pub mod nvpair; | ||
-pub mod nvstream; | ||
-pub mod spa; | ||
-pub mod space_map; | ||
-pub mod taskq; | ||
-pub mod txg; | ||
-pub mod uberblock; | ||
-pub mod util; | ||
-pub mod vdev; | ||
-pub mod vdev_file; | ||
-pub mod xdr; | ||
-pub mod zap; | ||
-pub mod zfs; | ||
-pub mod zil_header; | ||
-pub mod zio; | ||
- | ||
-pub struct ZfsReader { | ||
- pub zio: zio::Reader, | ||
- pub arc: ArCache, | ||
-} | ||
- | ||
-impl ZfsReader { | ||
- pub fn read_block(&mut self, block_ptr: &BlockPtr) -> Result<Vec<u8>, String> { | ||
- let data = self.arc.read(&mut self.zio, &block_ptr.dvas[0]); | ||
- match block_ptr.compression() { | ||
- 2 => { | ||
- // compression off | ||
- data | ||
- } | ||
- 1 | 3 => { | ||
- // lzjb compression | ||
- let mut decompressed = vec![0; (block_ptr.lsize()*512) as usize]; | ||
- lzjb::decompress(&match data { | ||
- Ok(data) => data, | ||
- Err(e) => return Err(e), | ||
- }, | ||
- &mut decompressed); | ||
- Ok(decompressed) | ||
- } | ||
- u => Err(format!("Error: Unknown compression type {}", u)), | ||
- } | ||
- } | ||
- | ||
- pub fn read_type<T: FromBytes>(&mut self, block_ptr: &BlockPtr) -> Result<T, String> { | ||
- let data = self.read_block(block_ptr); | ||
- data.and_then(|data| T::from_bytes(&data[..])) | ||
- } | ||
- | ||
- pub fn read_type_array<T: FromBytes>(&mut self, | ||
- block_ptr: &BlockPtr, | ||
- offset: usize) | ||
- -> Result<T, String> { | ||
- let data = self.read_block(block_ptr); | ||
- data.and_then(|data| T::from_bytes(&data[offset * mem::size_of::<T>()..])) | ||
- } | ||
- | ||
- pub fn uber(&mut self, _: &[u8]) -> Result<Uberblock, String> { | ||
- let mut newest_uberblock: Option<Uberblock> = None; | ||
- for i in 0..128 { | ||
- // let ub_len = 2*512; | ||
- // let ub_start = i * ub_len; | ||
- // let ub_end = ub_start + ub_len; | ||
- // if let Ok(uberblock) = Uberblock::from_bytes(&uberblocks[ub_start..ub_end]) { | ||
- if let Ok(uberblock) = Uberblock::from_bytes(&self.zio.read(256 + i * 2, 2)) { | ||
- let newest = match newest_uberblock { | ||
- Some(previous) => { | ||
- if uberblock.txg > previous.txg { | ||
- // Found a newer uberblock | ||
- true | ||
- } else { | ||
- false | ||
- } | ||
- } | ||
- // No uberblock yet, so first one we find is the newest | ||
- None => true, | ||
- }; | ||
- | ||
- if newest { | ||
- newest_uberblock = Some(uberblock); | ||
- } | ||
- } | ||
- } | ||
- | ||
- match newest_uberblock { | ||
- Some(uberblock) => Ok(uberblock), | ||
- None => Err("Failed to find valid uberblock".to_string()), | ||
- } | ||
- } | ||
-} | ||
- | ||
-#[derive(Copy, Clone, PartialEq)] | ||
-pub enum ZfsTraverse { | ||
- ThisDir, | ||
- Done, | ||
-} | ||
- | ||
-pub struct Zfs { | ||
- pub reader: ZfsReader, | ||
- pub uberblock: Uberblock, // The active uberblock | ||
- pub mos: ObjectSetPhys, | ||
- fs_objset: ObjectSetPhys, | ||
- master_node: DNodePhys, | ||
- root: u64, | ||
-} | ||
- | ||
-impl Zfs { | ||
- pub fn new(disk: File) -> Result<Self, String> { | ||
- let mut zfs_reader = ZfsReader { | ||
- zio: zio::Reader { disk: disk }, | ||
- arc: ArCache::new(), | ||
- }; | ||
- | ||
- // Read vdev label | ||
- // let vdev_label = Box::new(try!(VdevLabel::from_bytes(&zfs_reader.zio.read(0, 256 * 2)))); | ||
- // let mut xdr = xdr::MemOps::new(&mut vdev_label.nv_pairs); | ||
- // let nv_list = try!(nvstream::decode_nv_list(&mut xdr).map_err(|e| format!("{:?}", e))); | ||
- // let vdev_tree = | ||
- // match nv_list.find("vdev_tree") { | ||
- // Some(vdev_tree) => { | ||
- // vdev_tree | ||
- // }, | ||
- // None => { | ||
- // return Err("No vdev_tree in vdev label nvpairs".to_string()); | ||
- // }, | ||
- // }; | ||
- // | ||
- // let vdev_tree = | ||
- // if let NvValue::NvList(ref vdev_tree) = *vdev_tree { | ||
- // vdev_tree | ||
- // } else { | ||
- // return Err("vdev_tree is not NvValue::NvList".to_string()); | ||
- // }; | ||
- | ||
- | ||
- // Get the active uberblock | ||
- // let uberblock = try!(zfs_reader.uber(&vdev_label.uberblocks)); | ||
- let uberblock = try!(zfs_reader.uber(&[])); | ||
- | ||
- // let mos_dva = uberblock.rootbp.dvas[0]; | ||
- let mos: ObjectSetPhys = try!(zfs_reader.read_type(&uberblock.rootbp)); | ||
- let mos_bp1 = mos.meta_dnode.get_blockptr(0); | ||
- | ||
- // 2nd dnode in MOS points at the root dataset zap | ||
- let dnode1: DNodePhys = try!(zfs_reader.read_type_array(&mos_bp1, 1)); | ||
- | ||
- let root_ds_bp = dnode1.get_blockptr(0); | ||
- let root_ds: zap::MZapWrapper = try!(zfs_reader.read_type(root_ds_bp)); | ||
- | ||
- let root_ds_dnode: DNodePhys = | ||
- try!(zfs_reader.read_type_array(&mos_bp1, root_ds.chunks[0].value as usize)); | ||
- | ||
- let dsl_dir = try!(DslDirPhys::from_bytes(root_ds_dnode.get_bonus())); | ||
- let head_ds_dnode: DNodePhys = | ||
- try!(zfs_reader.read_type_array(&mos_bp1, dsl_dir.head_dataset_obj as usize)); | ||
- | ||
- let root_dataset = try!(DslDatasetPhys::from_bytes(head_ds_dnode.get_bonus())); | ||
- | ||
- let fs_objset: ObjectSetPhys = try!(zfs_reader.read_type(&root_dataset.bp)); | ||
- | ||
- let mut indirect: BlockPtr = try!(zfs_reader.read_type_array(fs_objset.meta_dnode | ||
- .get_blockptr(0), | ||
- 0)); | ||
- while indirect.level() > 0 { | ||
- indirect = try!(zfs_reader.read_type_array(&indirect, 0)); | ||
- } | ||
- | ||
- // Master node is always the second object in the object set | ||
- let master_node: DNodePhys = try!(zfs_reader.read_type_array(&indirect, 1)); | ||
- let master_node_zap: zap::MZapWrapper = | ||
- try!(zfs_reader.read_type(master_node.get_blockptr(0))); | ||
- | ||
- // Find the ROOT zap entry | ||
- let mut root = None; | ||
- for chunk in &master_node_zap.chunks { | ||
- if chunk.name() == Some("ROOT") { | ||
- root = Some(chunk.value); | ||
- break; | ||
- } | ||
- } | ||
- | ||
- let root = match root { | ||
- Some(root) => Ok(root), | ||
- None => Err("Error: failed to get the ROOT".to_string()), | ||
- }; | ||
- | ||
- Ok(Zfs { | ||
- reader: zfs_reader, | ||
- uberblock: uberblock, | ||
- mos: mos, | ||
- fs_objset: fs_objset, | ||
- master_node: master_node, | ||
- root: try!(root), | ||
- }) | ||
- } | ||
- | ||
- pub fn traverse<F, T>(&mut self, mut f: F) -> Option<T> | ||
- where F: FnMut(&mut Self, | ||
- &str, | ||
- usize, | ||
- &mut DNodePhys, | ||
- &BlockPtr, | ||
- &mut Option<T>) | ||
- -> Option<ZfsTraverse> | ||
- { | ||
- // Given the fs_objset and the object id of the root directory, we can traverse the | ||
- // directory tree. | ||
- // TODO: Cache object id of paths | ||
- // TODO: Calculate path through objset blockptr tree to use | ||
- let mut indirect: BlockPtr = self.reader | ||
- .read_type_array(self.fs_objset | ||
- .meta_dnode | ||
- .get_blockptr(0), | ||
- 0) | ||
- .unwrap(); | ||
- while indirect.level() > 0 { | ||
- indirect = self.reader.read_type_array(&indirect, 0).unwrap(); | ||
- } | ||
- // Set the cur_node to the root node, located at an L0 indirect block | ||
- let root = self.root as usize; | ||
- let mut cur_node: DNodePhys = self.reader | ||
- .read_type_array(&indirect, self.root as usize) | ||
- .unwrap(); | ||
- let mut result = None; | ||
- if f(self, "", root, &mut cur_node, &indirect, &mut result) == Some(ZfsTraverse::Done) { | ||
- return result; | ||
- } | ||
- 'traverse: loop { | ||
- // Directory dnodes point at zap objects. File/directory names are mapped to their | ||
- // fs_objset object ids. | ||
- let dir_contents: zap::MZapWrapper = self.reader | ||
- .read_type(cur_node.get_blockptr(0)) | ||
- .unwrap(); | ||
- let mut next_dir = None; | ||
- for chunk in &dir_contents.chunks { | ||
- match chunk.name() { | ||
- Some(chunk_name) => { | ||
- // Stop once we get to a null entry | ||
- if chunk_name.is_empty() { | ||
- break; | ||
- } | ||
- | ||
- let traverse = f(self, | ||
- chunk_name, | ||
- chunk.value as usize, | ||
- &mut cur_node, | ||
- &indirect, | ||
- &mut result); | ||
- if let Some(traverse) = traverse { | ||
- match traverse { | ||
- ZfsTraverse::ThisDir => { | ||
- // Found the folder we were looking for | ||
- next_dir = Some(chunk.value); | ||
- break; | ||
- } | ||
- ZfsTraverse::Done => { | ||
- break 'traverse; | ||
- } | ||
- } | ||
- } | ||
- } | ||
- None => { | ||
- // Invalid directory name | ||
- return None; | ||
- } | ||
- } | ||
- } | ||
- if next_dir.is_none() { | ||
- break; | ||
- } | ||
- } | ||
- result | ||
- } | ||
- | ||
- pub fn read_file(&mut self, path: &str) -> Option<Vec<u8>> { | ||
- let path = path.trim_matches('/'); // Robust against different url styles | ||
- let path_end_index = path.rfind('/').map(|i| i + 1).unwrap_or(0); | ||
- let path_end = &path[path_end_index..]; | ||
- let mut folder_iter = path.split('/'); | ||
- let mut folder = folder_iter.next(); | ||
- | ||
- let file_contents = self.traverse(|zfs, name, node_id, node, indirect, result| { | ||
- let mut this_dir = false; | ||
- if let Some(folder) = folder { | ||
- if name == folder { | ||
- *node = zfs.reader | ||
- .read_type_array(indirect, node_id as usize) | ||
- .unwrap(); | ||
- if name == path_end { | ||
- if node.object_type != ObjectType::PlainFileContents { | ||
- // Not a file | ||
- return Some(ZfsTraverse::Done); | ||
- } | ||
- // Found the file | ||
- let file_contents = zfs.reader | ||
- .read_block(node.get_blockptr(0)) | ||
- .unwrap(); | ||
- // TODO: Read file size from ZPL rather than look for terminating 0 | ||
- let file_contents: Vec<u8> = file_contents.into_iter() | ||
- .take_while(|c| *c != 0) | ||
- .collect(); | ||
- *result = Some(file_contents); | ||
- return Some(ZfsTraverse::Done); | ||
- } | ||
- this_dir = true; | ||
- } | ||
- } | ||
- if this_dir { | ||
- if node.object_type != ObjectType::DirectoryContents { | ||
- // Not a folder | ||
- return Some(ZfsTraverse::Done); | ||
- } | ||
- folder = folder_iter.next(); | ||
- return Some(ZfsTraverse::ThisDir); | ||
- } | ||
- None | ||
- }); | ||
- | ||
- file_contents | ||
- } | ||
- | ||
- pub fn ls(&mut self, path: &str) -> Option<Vec<String>> { | ||
- let path = path.trim_matches('/'); // Robust against different url styles | ||
- let path_end_index = path.rfind('/').map(|i| i + 1).unwrap_or(0); | ||
- let path_end = &path[path_end_index..]; | ||
- let mut folder_iter = path.split('/'); | ||
- let mut folder = folder_iter.next(); | ||
- | ||
- let file_contents = self.traverse(|zfs, name, node_id, node, indirect, result| { | ||
- let mut this_dir = false; | ||
- if let Some(folder) = folder { | ||
- if name == folder { | ||
- if folder == path_end { | ||
- *node = zfs.reader | ||
- .read_type_array(indirect, node_id as usize) | ||
- .unwrap(); | ||
- let dir_contents: zap::MZapWrapper = zfs.reader | ||
- .read_type(node.get_blockptr(0)) | ||
- .unwrap(); | ||
- | ||
- let ls: Vec<String> = dir_contents.chunks | ||
- .iter() | ||
- .map(|x| { | ||
- if x.value & 0xF000000000000000 == | ||
- 0x4000000000000000 { | ||
- x.name().unwrap().to_string() + | ||
- "/" | ||
- } else { | ||
- x.name().unwrap().to_string() | ||
- } | ||
- }) | ||
- .take_while(|x| !x.is_empty()) | ||
- .collect(); | ||
- *result = Some(ls); | ||
- return Some(ZfsTraverse::Done); | ||
- } | ||
- this_dir = true; | ||
- } | ||
- } | ||
- if this_dir { | ||
- folder = folder_iter.next(); | ||
- return Some(ZfsTraverse::ThisDir); | ||
- } | ||
- None | ||
- }); | ||
- | ||
- file_contents | ||
- } | ||
-} | ||
- | ||
-// TODO: Find a way to remove all the to_string's | ||
-fn main() { | ||
- println!("Type open zfs.img to open the image file"); | ||
- | ||
- let mut zfs_option: Option<Zfs> = None; | ||
- | ||
- 'reading: loop { | ||
- print!("# "); | ||
- stdout().flush(); | ||
- | ||
- if let Some(line) = readln!() { | ||
- let args: Vec<String> = line.trim().split(' ').map(|arg| arg.to_string()).collect(); | ||
- | ||
- if let Some(command) = args.get(0) { | ||
- let mut close = false; | ||
- match zfs_option { | ||
- Some(ref mut zfs) => { | ||
- if command == "uber" { | ||
- let ref uberblock = zfs.uberblock; | ||
- // 128 KB of ubers after 128 KB of other stuff | ||
- println!("Newest Uberblock {:X}", zfs.uberblock.magic); | ||
- println!("Version {}", uberblock.version); | ||
- println!("TXG {}", uberblock.txg); | ||
- println!("GUID {:X}", uberblock.guid_sum); | ||
- println!("Timestamp {}", uberblock.timestamp); | ||
- println!("ROOTBP[0] {:?}", uberblock.rootbp.dvas[0]); | ||
- println!("ROOTBP[1] {:?}", uberblock.rootbp.dvas[1]); | ||
- println!("ROOTBP[2] {:?}", uberblock.rootbp.dvas[2]); | ||
- } else if command == "spa_import" { | ||
- let mut nvpairs_buffer = zfs.reader.zio.read(32, 224); | ||
- let mut xdr = xdr::MemOps::new(&mut nvpairs_buffer); | ||
- let nv_list = nvstream::decode_nv_list(&mut xdr).unwrap(); | ||
- let name = nv_list.get::<&String>("name").unwrap().clone(); | ||
- let spa = spa::Spa::import(name, nv_list).unwrap(); | ||
- } else if command == "vdev_label" { | ||
- match VdevLabel::from_bytes(&zfs.reader.zio.read(0, 256 * 2)) { | ||
- Ok(ref mut vdev_label) => { | ||
- let mut xdr = xdr::MemOps::new(&mut vdev_label.nv_pairs); | ||
- let nv_list = nvstream::decode_nv_list(&mut xdr).unwrap(); | ||
- println!("Got nv_list:\n{:?}", nv_list); | ||
- match nv_list.find("vdev_tree") { | ||
- Some(vdev_tree) => { | ||
- println!("Got vdev_tree"); | ||
- | ||
- let vdev_tree = if let NvValue::NvList(ref vdev_tree) = | ||
- *vdev_tree { | ||
- Some(vdev_tree) | ||
- } else { | ||
- None | ||
- }; | ||
- | ||
- match vdev_tree.unwrap().find("metaslab_array") { | ||
- Some(metaslab_array) => { | ||
- println!("Got metaslab_array"); | ||
- if let NvValue::Uint64(metaslab_array) = | ||
- *metaslab_array { | ||
- // Get metaslab array dnode | ||
- let metaslab_array = metaslab_array as usize; | ||
- let ma_dnode: Result<DNodePhys, String> = | ||
- zfs.reader | ||
- .read_type_array(zfs.mos | ||
- .meta_dnode | ||
- .get_blockptr(0), | ||
- metaslab_array); | ||
- let ma_dnode = ma_dnode.unwrap(); // TODO | ||
- | ||
- // Get a spacemap object id | ||
- let sm_id: Result<u64, String> = | ||
- zfs.reader.read_type_array(ma_dnode.get_blockptr(0), 0); | ||
- let sm_id = sm_id.unwrap(); // TODO | ||
- | ||
- let sm_dnode: Result<DNodePhys, String> = | ||
- zfs.reader | ||
- .read_type_array(zfs.mos | ||
- .meta_dnode | ||
- .get_blockptr(0), | ||
- sm_id as usize); | ||
- let sm_dnode = sm_dnode.unwrap(); // TODO | ||
- let space_map_phys = SpaceMapPhys::from_bytes(sm_dnode.get_bonus()).unwrap(); // TODO | ||
- let space_map: Result<Vec<u8>, String> = | ||
- zfs.reader | ||
- .read_block(sm_dnode.get_blockptr(0)); | ||
- | ||
- println!("got space map id: {:?}", sm_id); | ||
- println!("got space map dnode: {:?}", sm_dnode); | ||
- println!("got space map phys: {:?}", | ||
- space_map_phys); | ||
- // println!("got space map: {:?}", &space_map.unwrap()[0..64]); | ||
- | ||
- let mut range_tree: avl::Tree<space_map::Entry, | ||
- u64> = | ||
- avl::Tree::new(Rc::new(|x| x.offset())); | ||
- // space_map::load_space_map_avl(&space_map::SpaceMap { size: 30 }, | ||
- // &mut range_tree, | ||
- // &space_map.unwrap(), | ||
- // space_map::MapType::Alloc).unwrap(); | ||
- } else { | ||
- println!("Invalid metaslab_array NvValue \ | ||
- type. Expected Uint64."); | ||
- } | ||
- } | ||
- None => { | ||
- println!("No `metaslab_array` in vdev_tree"); | ||
- } | ||
- }; | ||
- } | ||
- None => { | ||
- println!("No `vdev_tree` in vdev_label nvpairs"); | ||
- } | ||
- } | ||
- } | ||
- Err(e) => { | ||
- println!("Couldn't read vdev_label: {}", e); | ||
- } | ||
- } | ||
- } else if command == "file" { | ||
- match args.get(1) { | ||
- Some(arg) => { | ||
- let file = zfs.read_file(arg); | ||
- match file { | ||
- Some(file) => { | ||
- println!("File contents: {}", | ||
- str::from_utf8(&file).unwrap()); | ||
- } | ||
- None => println!("Failed to read file"), | ||
- } | ||
- } | ||
- None => println!("Usage: file <path>"), | ||
- } | ||
- } else if command == "ls" { | ||
- match args.get(1) { | ||
- Some(arg) => { | ||
- let ls = zfs.ls(arg); | ||
- match ls { | ||
- Some(ls) => { | ||
- for item in &ls { | ||
- print!("{}\t", item); | ||
- } | ||
- } | ||
- None => println!("Failed to read directory"), | ||
- } | ||
- } | ||
- None => println!("Usage: ls <path>"), | ||
- } | ||
- } else if command == "dump" { | ||
- match args.get(1) { | ||
- Some(arg) => { | ||
- if let Ok(sector) = arg.parse::<usize>() { | ||
- println!("Dump sector: {}", sector); | ||
- | ||
- let data = zfs.reader.zio.read(sector, 1); | ||
- for i in 0..data.len() { | ||
- if i % 32 == 0 { | ||
- print!("\n{:X}:", i); | ||
- } | ||
- if let Some(byte) = data.get(i) { | ||
- print!(" {:X}", *byte); | ||
- } else { | ||
- println!(" !"); | ||
- } | ||
- } | ||
- print!("\n"); | ||
- } else { | ||
- println!("Sector not a number"); | ||
- } | ||
- } | ||
- None => println!("No sector specified!"), | ||
- } | ||
- } else if command == "close" { | ||
- println!("Closing"); | ||
- close = true; | ||
- } else if command == "exit" { | ||
- break 'reading; | ||
- } else { | ||
- println!("Commands: uber vdev_label file ls dump close exit"); | ||
- } | ||
- } | ||
- None => { | ||
- if command == "open" { | ||
- match args.get(1) { | ||
- Some(arg) => { | ||
- match File::open(arg) { | ||
- Ok(file) => { | ||
- let zfs = Zfs::new(file); | ||
- if let Err(ref e) = zfs { | ||
- println!("Error: {:?}", e); | ||
- } else { | ||
- println!("Open: {}", arg); | ||
- } | ||
- zfs_option = zfs.ok(); | ||
- } | ||
- Err(err) => println!("Failed to open {}: {}", arg, err), | ||
- } | ||
- } | ||
- None => println!("No file specified!"), | ||
- } | ||
- } else if command == "exit" { | ||
- break 'reading; | ||
- } else { | ||
- println!("Commands: open exit"); | ||
- } | ||
- } | ||
- } | ||
- if close { | ||
- zfs_option = None; | ||
- } | ||
- } | ||
- } else { | ||
- break 'reading; | ||
- } | ||
- } | ||
-} |
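The removed main() is a small shell: print a prompt, read a line with the readln! macro, split it on spaces, and dispatch on the first word (open, uber, file, ls, dump, close, exit). A minimal, self-contained sketch of that loop shape follows; the echo command is a placeholder, not one of the removed commands.

    use std::io::{stdin, stdout, Write};

    fn main() {
        loop {
            print!("# ");
            stdout().flush().unwrap();

            let mut line = String::new();
            if stdin().read_line(&mut line).unwrap_or(0) == 0 {
                break; // EOF
            }
            let args: Vec<&str> = line.trim().split(' ').collect();

            match args.first() {
                Some(&"echo") => println!("{}", args[1..].join(" ")),
                Some(&"exit") => break,
                Some(&"") | None => {}
                Some(cmd) => println!("Commands: echo exit (got {:?})", cmd),
            }
        }
    }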
587  crates/zfs/metaslab.rs
@@ -1,587 +0,0 @@ | ||
-use std::cmp; | ||
-use std::rc::Rc; | ||
- | ||
-use super::avl; | ||
-use super::dmu_objset::ObjectSet; | ||
-use super::space_map::{self, Segment, SpaceMap}; | ||
-use super::taskq::{self, Taskq}; | ||
-use super::txg; | ||
-use util; | ||
-use super::vdev; | ||
-use super::zfs; | ||
- | ||
-// A metaslab class encompasses a category of allocatable top-level vdevs. | ||
-// Each top-level vdev is associated with a metaslab group which defines | ||
-// the allocatable region for that vdev. Examples of these categories include | ||
-// "normal" for data block allocations (i.e. main pool allocations) or "log" | ||
-// for allocations designated for intent log devices (i.e. slog devices). | ||
-// When a block allocation is requested from the SPA it is associated with a | ||
-// metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging | ||
-// to the class can be used to satisfy that request. Allocations are done | ||
-// by traversing the metaslab groups that are linked off of the `rotor` field. | ||
-// This rotor points to the next metaslab group where allocations will be | ||
-// attempted. Allocating a block is a 3 step process -- select the metaslab | ||
-// group, select the metaslab, and then allocate the block. The metaslab | ||
-// class defines the low-level block allocator that will be used as the | ||
-// final step in allocation. These allocators are pluggable allowing each class | ||
-// to use a block allocator that best suits that class. | ||
-// | ||
-pub struct MetaslabClass { | ||
- // spa: *Spa, | ||
- // rotor: *MetaslabGroup, | ||
- ops: Rc<MetaslabOps>, | ||
- aliquot: u64, | ||
- alloc_groups: u64, // # of allocatable groups | ||
- alloc: u64, // total allocated space | ||
- deferred: u64, // total deferred frees | ||
- space: u64, // total space (alloc + free) | ||
- dspace: u64, /* total deflated space | ||
- * histogram: [u64, RANGE_TREE_HISTOGRAM_SIZE], | ||
- * fastwrite_lock: kmutex_t, */ | ||
-} | ||
- | ||
-impl MetaslabClass { | ||
- pub fn create(ops: Rc<MetaslabOps>) -> MetaslabClass { | ||
- // mutex_init(&mc->mc_fastwrite_lock, NULL, MUTEX_DEFAULT, NULL); | ||
- | ||
- MetaslabClass { | ||
- // rotor: NULL, | ||
- ops: ops, | ||
- aliquot: 0, | ||
- alloc_groups: 0, | ||
- alloc: 0, | ||
- deferred: 0, | ||
- space: 0, | ||
- dspace: 0, | ||
- } | ||
- } | ||
-} | ||
- | ||
-// Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) | ||
-// of a top-level vdev. They are linked togther to form a circular linked | ||
-// list and can belong to only one metaslab class. Metaslab groups may become | ||
-// ineligible for allocations for a number of reasons such as limited free | ||
-// space, fragmentation, or going offline. When this happens the allocator will | ||
-// simply find the next metaslab group in the linked list and attempt | ||
-// to allocate from that group instead. | ||
-// | ||
-pub struct MetaslabGroup { | ||
- // lock: kmutex_t, | ||
- metaslab_tree: avl::Tree<MetaslabAvlNode, (u64, u64)>, | ||
- aliquot: u64, | ||
- allocatable: bool, // can we allocate? | ||
- free_capacity: u64, // percentage free | ||
- bias: i64, | ||
- activation_count: i64, | ||
- ms_class: Rc<MetaslabClass>, | ||
- // vdev: vdev::TreeIndex, | ||
- taskq: Taskq, | ||
- // prev: *MetaslabGroup, | ||
- // next: *MetaslabGroup, | ||
- fragmentation: u64, // histogram: [u64; RANGE_TREE_HISTOGRAM_SIZE], | ||
-} | ||
- | ||
-impl MetaslabGroup { | ||
- pub fn create(ms_class: Rc<MetaslabClass>) -> Self { | ||
- let metaslab_key = Rc::new(|ms: &MetaslabAvlNode| (ms.weight, ms.start)); | ||
- let taskq = Taskq::new("metaslab_group_taskq".to_string(), | ||
- // metaslab_load_pct | ||
- 4, | ||
- 10, | ||
- -1i64 as u64, | ||
- // TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC | ||
- 0); | ||
- | ||
- MetaslabGroup { | ||
- // lock: kmutex_t, | ||
- metaslab_tree: avl::Tree::new(metaslab_key), | ||
- aliquot: 0, | ||
- allocatable: false, // can we allocate? | ||
- free_capacity: 0, // percentage free | ||
- bias: 0, | ||
- activation_count: 0, | ||
- ms_class: ms_class, | ||
- // vdev: vdev, | ||
- taskq: taskq, | ||
- // prev: *MetaslabGroup, | ||
- // next: *MetaslabGroup, | ||
- fragmentation: 0, // histogram: [0; RANGE_TREE_HISTOGRAM_SIZE], | ||
- } | ||
- } | ||
- | ||
- pub fn add(&mut self, index: usize, m: &Metaslab) { | ||
- self.metaslab_tree.insert(MetaslabAvlNode { | ||
- index: index, | ||
- start: m.start, | ||
- weight: m.weight, | ||
- }); | ||
- } | ||
- | ||
- pub fn activate(&mut self) { | ||
- // metaslab_class_t *mc = self.class; | ||
- // metaslab_group_t *mgprev, *mgnext; | ||
- // | ||
- // assert!(spa_config_held(ms_class.spa, SCL_ALLOC, RW_WRITER)); | ||
- // | ||
- // assert!(ms_class.rotor != mg); | ||
- // assert!(self.prev == NULL); | ||
- // assert!(self.next == NULL); | ||
- // assert!(self.activation_count <= 0); | ||
- // | ||
- // if (++self.activation_count <= 0) | ||
- // return; | ||
- // | ||
- // self.aliquot = metaslab_aliquot * cmp::max(1, self.vdev->vdev_children); | ||
- // metaslab_group_alloc_update(mg); | ||
- // | ||
- // if (mgprev = ms_class.rotor) == NULL { | ||
- // self.prev = mg; | ||
- // self.next = mg; | ||
- // } else { | ||
- // mgnext = mgprev->mg_next; | ||
- // self.prev = mgprev; | ||
- // self.next = mgnext; | ||
- // mgprev->mg_next = mg; | ||
- // mgnext->mg_prev = mg; | ||
- // } | ||
- // ms_class.rotor = mg; | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-// This value defines the number of elements in the lbas array. The value | ||
-// of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. | ||
-// This is the equivalent of highbit(UINT64_MAX). | ||
-const MAX_LBAS: usize = 64; | ||
- | ||
-// Each metaslab maintains a set of in-core trees to track metaslab operations. | ||
-// The in-core free tree (ms_tree) contains the current list of free segments. | ||
-// As blocks are allocated, the allocated segments are removed from the ms_tree | ||
-// and added to a per txg allocation tree (ms_alloctree). As blocks are freed, | ||
-// they are added to the per txg free tree (ms_freetree). These per txg | ||
-// trees allow us to process all allocations and frees in syncing context | ||
-// where it is safe to update the on-disk space maps. One additional in-core | ||
-// tree is maintained to track deferred frees (ms_defertree). Once a block | ||
-// is freed it will move from the ms_freetree to the ms_defertree. A deferred | ||
-// free means that a block has been freed but cannot be used by the pool | ||
-// until TXG_DEFER_SIZE transactions groups later. For example, a block | ||
-// that is freed in txg 50 will not be available for reallocation until | ||
-// txg 52 (50 + TXG_DEFER_SIZE). This provides a safety net for uberblock | ||
-// rollback. A pool could be safely rolled back TXG_DEFER_SIZE | ||
-// transaction groups and ensure that no block has been reallocated. | ||
-// | ||
-// The simplified transition diagram looks like this: | ||
-// | ||
-// | ||
-// ALLOCATE | ||
-// | | ||
-// V | ||
-// free segment (tree) --------> alloc_tree ----> (write to space map) | ||
-// ^ | ||
-// | | ||
-// | free_tree <--- FREE | ||
-// | | | ||
-// | | | ||
-// | | | ||
-// +----------- defer_tree <-------+---------> (write to space map) | ||
-// | ||
-// | ||
-// Each metaslab's space is tracked in a single space map in the MOS, | ||
-// which is only updated in syncing context. Each time we sync a txg, | ||
-// we append the allocs and frees from that txg to the space map. | ||
-// The pool space is only updated once all metaslabs have finished syncing. | ||
-// | ||
-// To load the in-core free tree we read the space map from disk. | ||
-// This object contains a series of alloc and free records that are | ||
-// combined to make up the list of all free segments in this metaslab. These | ||
-// segments are represented in-core by the ms_tree and are stored in an | ||
-// AVL tree. | ||
-// | ||
-// As the space map grows (as a result of the appends) it will | ||
-// eventually become space-inefficient. When the metaslab's in-core free tree | ||
-// is zfs_condense_pct/100 times the size of the minimal on-disk | ||
-// representation, we rewrite it in its minimized form. If a metaslab | ||
-// needs to condense then we must set the condensing flag to ensure | ||
-// that allocations are not performed on the metaslab that is being written. | ||
-// | ||
- | ||
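To make the transition diagram above concrete, here is a minimal sketch of that segment lifecycle. It deliberately uses plain `Vec`s and illustrative `TXG_SIZE`/`DEFER_SIZE` constants instead of the crate's `avl::Tree` and space-map sync machinery, so it only models the flow of segments between the trees:

```rust
use std::mem;

// Illustrative constants; the real values live in the txg module.
const TXG_SIZE: usize = 4;
const DEFER_SIZE: usize = 2;

#[derive(Clone, Copy, Debug)]
struct Seg { start: u64, size: u64 }

// Simplified stand-ins for ms_tree, ms_alloctree, ms_freetree, ms_defertree.
struct MiniMetaslab {
    free: Vec<Seg>,            // current free segments
    alloc_tree: Vec<Vec<Seg>>, // TXG_SIZE per-txg allocations
    free_tree: Vec<Vec<Seg>>,  // TXG_SIZE per-txg frees
    defer_tree: Vec<Vec<Seg>>, // DEFER_SIZE deferred frees
}

impl MiniMetaslab {
    // ALLOCATE: a segment leaves the free tree for this txg's alloc tree.
    fn allocate(&mut self, txg: u64, seg: Seg) {
        self.free.retain(|s| s.start != seg.start);
        self.alloc_tree[txg as usize % TXG_SIZE].push(seg);
    }

    // FREE: the segment is only recorded in this txg's free tree for now.
    fn free(&mut self, txg: u64, seg: Seg) {
        self.free_tree[txg as usize % TXG_SIZE].push(seg);
    }

    // Syncing txg N: this txg's frees move to the defer tree, and the frees
    // deferred DEFER_SIZE txgs ago finally become allocatable again.
    fn sync_done(&mut self, txg: u64) {
        let t = txg as usize % TXG_SIZE;
        let d = txg as usize % DEFER_SIZE;
        let matured = mem::replace(&mut self.defer_tree[d], Vec::new());
        self.free.extend(matured);
        self.defer_tree[d] = mem::replace(&mut self.free_tree[t], Vec::new());
        self.alloc_tree[t].clear(); // already accounted for in the space map
    }
}
```

Read against the example in the comment: a segment freed while txg 50 is open sits in its free tree until txg 50 syncs, waits in the defer tree through txg 51, and re-enters the free tree when txg 52 syncs.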
-pub struct Metaslab { | ||
- // lock: kmutex_t, | ||
- // load_cv: kcondvar_t, | ||
- space_map: Option<SpaceMap>, | ||
- ops: Rc<MetaslabOps>, | ||
- id: u64, | ||
- start: u64, | ||
- size: u64, | ||
- fragmentation: u64, | ||
- | ||
- // Sorted by start | ||
- alloc_tree: Vec<avl::Tree<space_map::Segment, u64>>, // txg::TXG_SIZE | ||
- free_tree: Vec<avl::Tree<space_map::Segment, u64>>, // txg::TXG_SIZE | ||
- defer_tree: Vec<avl::Tree<space_map::Segment, u64>>, // txg::DEFER_SIZE | ||
- tree: avl::Tree<space_map::Segment, u64>, | ||
- | ||
- condensing: bool, | ||
- condense_wanted: bool, | ||
- loaded: bool, | ||
- loading: bool, | ||
- | ||
- defer_space: i64, // sum of defermap[] space | ||
- weight: u64, // weight vs others in group | ||
- access_txg: u64, | ||
- | ||
- // The metaslab block allocators can optionally use a size-ordered | ||
- // range tree and/or an array of LBAs. Not all allocators use | ||
- // this functionality. The size_tree should always contain the | ||
- // same number of segments as the tree. The only difference | ||
- // is that the size_tree is ordered by segment sizes. | ||
- size_tree: avl::Tree<space_map::Segment, u64>, // Sorted by size | ||
- lbas: [u64; MAX_LBAS], /* group: *MetaslabGroup, | ||
- * avl_node_t ms_group_node, // node in metaslab group tree | ||
- * txg_node_t ms_txg_node, // per-txg dirty metaslab links */ | ||
-} | ||
- | ||
-impl Metaslab { | ||
- pub fn new(ops: Rc<MetaslabOps>, | ||
- id: u64, | ||
- start: u64, | ||
- size: u64, | ||
- space_map: Option<SpaceMap>) | ||
- -> Self { | ||
- let seg_key_start = Rc::new(|seg: &Segment| seg.start); | ||
- let seg_key_size = Rc::new(|seg: &Segment| seg.size); | ||
- | ||
- Metaslab { | ||
- // lock: kmutex_t, | ||
- // load_cv: kcondvar_t, | ||
- space_map: space_map, | ||
- ops: ops, | ||
- id: id, | ||
- start: start, | ||
- size: size, | ||
- fragmentation: 0, | ||
- | ||
- alloc_tree: (0..txg::TXG_SIZE).map(|x| avl::Tree::new(seg_key_start.clone())).collect(), | ||
- free_tree: (0..txg::TXG_SIZE).map(|x| avl::Tree::new(seg_key_start.clone())).collect(), | ||
- defer_tree: (0..txg::DEFER_SIZE) | ||
- .map(|x| avl::Tree::new(seg_key_start.clone())) | ||
- .collect(), | ||
- tree: avl::Tree::new(seg_key_start), | ||
- | ||
- condensing: false, | ||
- condense_wanted: false, | ||
- loaded: false, | ||
- loading: false, | ||
- | ||
- defer_space: 0, | ||
- weight: 0, | ||
- access_txg: 0, | ||
- | ||
- size_tree: avl::Tree::new(seg_key_size), | ||
- lbas: [0; MAX_LBAS], /* group: *MetaslabGroup, | ||
- * avl_node_t ms_group_node, // node in metaslab group tree | ||
- * txg_node_t ms_txg_node, // per-txg dirty metaslab links */ | ||
- } | ||
- } | ||
- | ||
- pub fn init(mos: &mut ObjectSet, | ||
- vdev: &mut vdev::Vdev, | ||
- id: u64, | ||
- object: u64, | ||
- txg: u64) | ||
- -> zfs::Result<Self> { | ||
- // We assume this is a top-level vdev | ||
- let vdev_top = try!(vdev.top.as_mut().ok_or(zfs::Error::Invalid)); | ||
- | ||
- // mutex_init(&ms.lock, NULL, MUTEX_DEFAULT, NULL); | ||
- // cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL); | ||
- let start = id << vdev_top.ms_shift; | ||
- let size = 1 << vdev_top.ms_shift; | ||
- | ||
- // We only open space map objects that already exist. All others | ||
- // will be opened when we finally allocate an object for it. | ||
- let space_map = if object != 0 { | ||
- Some(try!(SpaceMap::open(mos, | ||
- object, | ||
- start, | ||
- size, | ||
- vdev.ashift as u8 /* , &ms.lock */))) | ||
- } else { | ||
- None | ||
- }; | ||
- | ||
- let mut metaslab = Self::new(vdev_top.ms_group.ms_class.ops.clone(), | ||
- id, | ||
- start, | ||
- size, | ||
- space_map); | ||
- | ||
- vdev_top.ms_group.add(id as usize, &metaslab); | ||
- | ||
- // metaslab.fragmentation = metaslab_fragmentation(metaslab); | ||
- | ||
- // If we're opening an existing pool (txg == 0) or creating | ||
- // a new one (txg == TXG_INITIAL), all space is available now. | ||
- // If we're adding space to an existing pool, the new space | ||
- // does not become available until after this txg has synced. | ||
- if txg <= txg::TXG_INITIAL as u64 { | ||
- // metaslab_sync_done(metaslab, 0); | ||
- } | ||
- | ||
- // If metaslab_debug_load is set and we're initializing a metaslab | ||
-        // that has an allocated space_map object, then load its space | ||
-        // map so that we can verify frees. | ||
- // if metaslab_debug_load && metaslab.space_map.is_some() { | ||
- // try!(metaslab.load()); | ||
- // } | ||
- | ||
- | ||
- // if txg != 0 { | ||
- // vdev.dirty(0, NULL, txg); | ||
- // vdev.dirty(vdev::DIRTY_METASLAB, ms, txg); | ||
- // } | ||
- | ||
- Ok(metaslab) | ||
- } | ||
- | ||
- pub fn load(&mut self) -> zfs::Result<()> { | ||
- let mut result = Ok(()); | ||
- // assert!(MUTEX_HELD(&self.lock)); | ||
- assert!(!self.loaded); | ||
- assert!(!self.loading); | ||
- | ||
- self.loading = true; | ||
- | ||
- // If the space map has not been allocated yet, then treat | ||
- // all the space in the metaslab as free and add it to the | ||
- // tree. | ||
- if let Some(ref mut space_map) = self.space_map { | ||
- // result = space_map.load(&mut self.tree, space_map::AllocType::Free); | ||
- } else { | ||
- self.tree.insert(Segment { | ||
- start: self.start, | ||
- size: self.size, | ||
- }); | ||
- } | ||
- | ||
- self.loaded = result.is_ok(); | ||
- self.loading = false; | ||
- | ||
- if self.loaded { | ||
- for t in 0..txg::DEFER_SIZE { | ||
- // self.defer_tree[t].in_order(range_tree_remove, self.tree); | ||
- } | ||
- } | ||
- // cv_broadcast(&self.load_cv); | ||
- result | ||
- } | ||
- | ||
- pub fn load_wait(&self) { | ||
- while self.loading { | ||
- assert!(!self.loaded); | ||
- // cv_wait(&msp->ms_load_cv, &msp->ms_lock); | ||
- } | ||
- } | ||
- | ||
- fn activate(&mut self, activation_weight: u64) -> zfs::Result<()> { | ||
- // TODO | ||
- // assert!(MUTEX_HELD(&self.lock)); | ||
- // | ||
- // if self.weight & METASLAB_ACTIVE_MASK == 0 { | ||
- // self.load_wait(); | ||
- // if !self.loaded { | ||
- // if let Err(e) = self.load() { | ||
- // metaslab_group_sort(self.group, msp, 0); | ||
- // return Err(e); | ||
- // } | ||
- // } | ||
- // | ||
- // metaslab_group_sort(self.group, self, self.weight | activation_weight); | ||
- // } | ||
- // assert!(self.loaded); | ||
- // assert!(self.weight & METASLAB_ACTIVE_MASK); | ||
- | ||
- | ||
- Ok(()) | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-pub struct MetaslabOps { | ||
- pub alloc: fn(ms: &mut Metaslab, size: u64) -> u64, | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-// The first-fit block allocator | ||
-pub fn ff_alloc(ms: &mut Metaslab, size: u64) -> u64 { | ||
- // Find the largest power of 2 block size that evenly divides the | ||
- // requested size. This is used to try to allocate blocks with similar | ||
- // alignment from the same area of the metaslab (i.e. same cursor | ||
-    // bucket), but it does not guarantee that allocations of other sizes | ||
-    // do not exist in the same region. | ||
- let align = size & -(size as i64) as u64; | ||
- let ref mut cursor = ms.lbas[(util::highbit64(align) - 1) as usize]; | ||
- let ref mut tree = ms.tree; | ||
- | ||
- // return metaslab_block_picker(tree, cursor, size, align); | ||
- return 0; | ||
-} | ||
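A worked example of the alignment and cursor-bucket math above, assuming `util::highbit64(x)` behaves like the usual ZFS helper (one-based index of the highest set bit, i.e. `64 - x.leading_zeros()`):

```rust
fn main() {
    // `size & -(size as i64) as u64` isolates the lowest set bit of `size`,
    // i.e. the largest power of two that evenly divides it.
    let size: u64 = 0x3000; // 12 KiB request
    let align = size & -(size as i64) as u64;
    assert_eq!(align, 0x1000); // 4 KiB alignment

    // ff_alloc keeps one cursor per power-of-two bucket in `lbas`,
    // indexed by highbit64(align) - 1.
    let highbit64 = |x: u64| 64 - x.leading_zeros() as u64;
    assert_eq!(highbit64(align) - 1, 12); // the cursor lives in lbas[12]
}
```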
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
-// This is a helper function that can be used by the allocator to find | ||
-// a suitable block to allocate. This will search the specified AVL | ||
-// tree looking for a block that matches the specified criteria. | ||
-// fn metaslab_block_picker(tree: &mut avl::Tree, cursor: &mut u64, size: u64, align: u64) -> u64 { | ||
-// range_seg_t *rs, rsearch; | ||
-// avl_index_t where; | ||
-// | ||
-// rsearch.rs_start = *cursor; | ||
-// rsearch.rs_end = *cursor + size; | ||
-// | ||
-// rs = tree.find(&rsearch, &where); | ||
-// if rs == NULL { | ||
-// rs = tree.nearest(where, AVL_AFTER); | ||
-// } | ||
-// | ||
-// while rs != NULL { | ||
-// let offset: u64 = util::p2roundup(rs->rs_start, align); | ||
-// | ||
-// if offset + size <= rs->rs_end { | ||
-// cursor = offset + size; | ||
-// return (offset); | ||
-// } | ||
-// rs = AVL_NEXT(t, rs); | ||
-// } | ||
-// | ||
-// If we know we've searched the whole map (*cursor == 0), give up. | ||
-// Otherwise, reset the cursor to the beginning and try again. | ||
-// if *cursor == 0 { | ||
-// return (-1ULL); | ||
-// } | ||
-// | ||
-// cursor = 0; | ||
-// return metaslab_block_picker(tree, cursor, size, align); | ||
-// } | ||
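The commented-out picker above translates fairly directly. The following is only a sketch of the search strategy, written over a sorted slice of `Segment`s rather than the AVL tree, with `p2roundup` inlined:

```rust
fn block_picker(segs: &[Segment], cursor: &mut u64, size: u64, align: u64) -> Option<u64> {
    // Round x up to a multiple of `align`; assumes `align` is a power of two.
    let p2roundup = |x: u64| (x + align - 1) & !(align - 1);

    // Walk free segments that end after the cursor; first fit wins.
    for seg in segs.iter().filter(|s| s.start + s.size > *cursor) {
        let offset = p2roundup(seg.start.max(*cursor));
        if offset + size <= seg.start + seg.size {
            *cursor = offset + size;
            return Some(offset);
        }
    }

    // Searched the whole map: give up if the cursor was already at the start,
    // otherwise reset it and try once more, like the C version.
    if *cursor == 0 {
        return None;
    }
    *cursor = 0;
    block_picker(segs, cursor, size, align)
}
```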
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-struct MetaslabAvlNode { | ||
- index: usize, | ||
- weight: u64, | ||
- start: u64, | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-// Allow allocations to switch to gang blocks quickly. We do this to | ||
-// avoid having to load lots of space_maps in a given txg. There are, | ||
-// however, some cases where we want to avoid "fast" ganging and instead | ||
-// we want to do an exhaustive search of all metaslabs on this device. | ||
-// Currently we don't allow any gang, slog, or dump device related allocations | ||
-// to "fast" gang. | ||
-// fn can_fast_gang(flags) -> bool { | ||
-// (flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | METASLAB_GANG_AVOID) == 0 | ||
-// } | ||
- | ||
- | ||
-const METASLAB_WEIGHT_PRIMARY: u64 = 1 << 63; | ||
-const METASLAB_WEIGHT_SECONDARY: u64 = 1 << 62; | ||
-const METASLAB_ACTIVE_MASK: u64 = METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY; | ||
- | ||
-// Metaslab granularity, in bytes. This is roughly similar to what would be | ||
-// referred to as the "stripe size" in traditional RAID arrays. In normal | ||
-// operation, we will try to write this amount of data to a top-level vdev | ||
-// before moving on to the next one. | ||
-static metaslab_aliquot: usize = 512 << 10; | ||
- | ||
-// static metaslab_gang_bang: u64 = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ | ||
- | ||
-// The in-core space map representation is more compact than its on-disk form. | ||
-// The zfs_condense_pct determines how much more compact the in-core | ||
-// space_map representation must be before we compact it on-disk. | ||
-// Values should be greater than or equal to 100. | ||
-static zfs_condense_pct: isize = 200; | ||
- | ||
-// Condensing a metaslab is not guaranteed to actually reduce the amount of | ||
-// space used on disk. In particular, a space map uses data in increments of | ||
-// MAX(1 << ashift, space_map_blksz), so a metaslab might use the | ||
-// same number of blocks after condensing. Since the goal of condensing is to | ||
-// reduce the number of IOPs required to read the space map, we only want to | ||
-// condense when we can be sure we will reduce the number of blocks used by the | ||
-// space map. Unfortunately, we cannot precisely compute whether or not this is | ||
-// the case in metaslab_should_condense since we are holding ms_lock. Instead, | ||
-// we apply the following heuristic: do not condense a spacemap unless the | ||
-// uncondensed size consumes greater than zfs_metaslab_condense_block_threshold | ||
-// blocks. | ||
-static zfs_metaslab_condense_block_threshold: isize = 4; | ||
- | ||
-// The zfs_mg_noalloc_threshold defines which metaslab groups should | ||
-// be eligible for allocation. The value is defined as a percentage of | ||
-// free space. Metaslab groups that have more free space than | ||
-// zfs_mg_noalloc_threshold are always eligible for allocations. Once | ||
-// a metaslab group's free space is less than or equal to the | ||
-// zfs_mg_noalloc_threshold the allocator will avoid allocating to that | ||
-// group unless all groups in the pool have reached zfs_mg_noalloc_threshold. | ||
-// Once all groups in the pool reach zfs_mg_noalloc_threshold then all | ||
-// groups are allowed to accept allocations. Gang blocks are always | ||
-// eligible to allocate on any metaslab group. The default value of 0 means | ||
-// no metaslab group will be excluded based on this criterion. | ||
-static zfs_mg_noalloc_threshold: isize = 0; | ||
- | ||
-// Metaslab groups are considered eligible for allocations if their | ||
-// fragmentation metric (measured as a percentage) is less than or equal to | ||
-// zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold | ||
-// then it will be skipped unless all metaslab groups within the metaslab | ||
-// class have also crossed this threshold. | ||
-static zfs_mg_fragmentation_threshold: isize = 85; | ||
- | ||
-// Allow metaslabs to keep their active state as long as their fragmentation | ||
-// percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An | ||
-// active metaslab that exceeds this threshold will no longer keep its active | ||
-// status allowing better metaslabs to be selected. | ||
-static zfs_metaslab_fragmentation_threshold: isize = 70; | ||
- | ||
-// When set will load all metaslabs when pool is first opened. | ||
-static metaslab_debug_load: isize = 0; | ||
- | ||
-// When set will prevent metaslabs from being unloaded. | ||
-static metaslab_debug_unload: isize = 0; | ||
- | ||
-// Minimum size which forces the dynamic allocator to change | ||
-// its allocation strategy. Once the space map cannot satisfy | ||
-// an allocation of this size then it switches to using a more | ||
-// aggressive strategy (i.e. search by size rather than offset). | ||
-// static metaslab_df_alloc_threshold: u64 = SPA_MAXBLOCKSIZE; | ||
- | ||
-// The minimum free space, in percent, which must be available | ||
-// in a space map to continue allocations in a first-fit fashion. | ||
-// Once the space_map's free space drops below this level we dynamically | ||
-// switch to using best-fit allocations. | ||
-static metaslab_df_free_pct: isize = 4; | ||
- | ||
-// Percentage of all cpus that can be used by the metaslab taskq. | ||
-static metaslab_load_pct: isize = 50; | ||
- | ||
-// Determines how many txgs a metaslab may remain loaded without having any | ||
-// allocations from it. As long as a metaslab continues to be used we will | ||
-// keep it loaded. | ||
-static metaslab_unload_delay: usize = txg::TXG_SIZE * 2; | ||
- | ||
-// Max number of metaslabs per group to preload. | ||
-// static metaslab_preload_limit: isize = SPA_DVAS_PER_BP; | ||
- | ||
-// Enable/disable preloading of metaslab. | ||
-static metaslab_preload_enabled: bool = true; | ||
- | ||
-// Enable/disable fragmentation weighting on metaslabs. | ||
-static metaslab_fragmentation_factor_enabled: bool = true; | ||
- | ||
-// Enable/disable lba weighting (i.e. outer tracks are given preference). | ||
-static metaslab_lba_weighting_enabled: bool = true; | ||
- | ||
-// Enable/disable metaslab group biasing. | ||
-static metaslab_bias_enabled: bool = true; | ||
- | ||
-// static uint64_t metaslab_fragmentation(metaslab_t *); |
385
crates/zfs/nvpair.rs
@@ -1,385 +0,0 @@ | ||
-use std::fmt; | ||
- | ||
-// nvp implementation version | ||
-pub const NV_VERSION: i32 = 0; | ||
- | ||
-// nvlist header | ||
-// #[derive(Debug)] | ||
-pub struct NvList { | ||
- pub version: i32, | ||
- pub nvflag: u32, // persistent flags | ||
- pub pairs: Vec<(String, NvValue)>, | ||
-} | ||
- | ||
-impl NvList { | ||
- pub fn new(nvflag: u32) -> Self { | ||
- NvList { | ||
- version: NV_VERSION, | ||
- nvflag: nvflag, | ||
- pairs: Vec::new(), | ||
- } | ||
- } | ||
- | ||
- pub fn add(&mut self, name: String, value: NvValue) { | ||
- self.pairs.push((name, value)); | ||
- } | ||
- | ||
- pub fn find(&self, name: &str) -> Option<&NvValue> { | ||
- for pair in &self.pairs { | ||
- if pair.0 == name { | ||
- return Some(&pair.1); | ||
- } | ||
- } | ||
- None | ||
- } | ||
- | ||
- pub fn find_mut(&mut self, name: &str) -> Option<&mut NvValue> { | ||
- for pair in &mut self.pairs { | ||
- if pair.0 == name { | ||
- return Some(&mut pair.1); | ||
- } | ||
- } | ||
- None | ||
- } | ||
- | ||
- pub fn get<'a, T: GetNvValue<'a>>(&'a self, name: &str) -> Option<T> { | ||
- self.find(name).and_then(|x| GetNvValue::get(x)) | ||
- } | ||
-} | ||
- | ||
-impl fmt::Debug for NvList { | ||
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
- try!(write!(f, | ||
- "NvList {{ version: {:X}, nvflag: {:X}, pairs: [\n", | ||
- self.version, | ||
- self.nvflag)); | ||
- for &(ref name, ref value) in &self.pairs { | ||
- if name.is_empty() { | ||
- break; | ||
- } | ||
- try!(write!(f, "{} : {:?}\n", name, value)); | ||
- } | ||
- try!(write!(f, "] }}\n")); | ||
- Ok(()) | ||
- } | ||
-} | ||
- | ||
-// TODO Auto implement Debug. format! currently crashes with big u32 values | ||
-// #[derive(Debug)] | ||
-pub enum NvValue { | ||
- Unknown, | ||
- Boolean, | ||
- Byte(u8), | ||
- Int16(i16), | ||
- Uint16(u16), | ||
- Int32(i32), | ||
- Uint32(u32), | ||
- Int64(i64), | ||
- Uint64(u64), | ||
- String(String), | ||
- ByteArray(Vec<u8>), | ||
- Int16Array(Vec<i16>), | ||
- Uint16Array(Vec<u16>), | ||
- Int32Array(Vec<i32>), | ||
- Uint32Array(Vec<u32>), | ||
- Int64Array(Vec<i64>), | ||
- Uint64Array(Vec<u64>), | ||
- StringArray(Vec<String>), | ||
- HrTime(i64), | ||
- NvList(NvList), | ||
- NvListArray(Vec<NvList>), | ||
- BooleanValue(bool), | ||
- Int8(i8), | ||
- Uint8(u8), | ||
- BooleanArray(Vec<bool>), | ||
- Int8Array(Vec<i8>), | ||
- Uint8Array(Vec<u8>), | ||
-} | ||
- | ||
-impl NvValue { | ||
- pub fn data_type(&self) -> DataType { | ||
- match *self { | ||
- NvValue::Unknown => DataType::Unknown, | ||
- NvValue::Boolean => DataType::Boolean, | ||
- NvValue::Byte(_) => DataType::Byte, | ||
- NvValue::Int16(_) => DataType::Int16, | ||
- NvValue::Uint16(_) => DataType::Uint16, | ||
- NvValue::Int32(_) => DataType::Int32, | ||
- NvValue::Uint32(_) => DataType::Uint32, | ||
- NvValue::Int64(_) => DataType::Int64, | ||
- NvValue::Uint64(_) => DataType::Uint64, | ||
- NvValue::String(_) => DataType::String, | ||
- NvValue::ByteArray(_) => DataType::ByteArray, | ||
- NvValue::Int16Array(_) => DataType::Int16Array, | ||
- NvValue::Uint16Array(_) => DataType::Uint16Array, | ||
- NvValue::Int32Array(_) => DataType::Int32Array, | ||
- NvValue::Uint32Array(_) => DataType::Uint32Array, | ||
- NvValue::Int64Array(_) => DataType::Int64Array, | ||
- NvValue::Uint64Array(_) => DataType::Uint64Array, | ||
- NvValue::StringArray(_) => DataType::StringArray, | ||
- NvValue::HrTime(_) => DataType::HrTime, | ||
- NvValue::NvList(_) => DataType::NvList, | ||
- NvValue::NvListArray(_) => DataType::NvListArray, | ||
- NvValue::BooleanValue(_) => DataType::BooleanValue, | ||
- NvValue::Int8(_) => DataType::Int8, | ||
- NvValue::Uint8(_) => DataType::Uint8, | ||
- NvValue::BooleanArray(_) => DataType::BooleanArray, | ||
- NvValue::Int8Array(_) => DataType::Int8Array, | ||
- NvValue::Uint8Array(_) => DataType::Uint8Array, | ||
- } | ||
- } | ||
- | ||
- pub fn num_elements(&self) -> usize { | ||
- match *self { | ||
- NvValue::Unknown => 1, | ||
- NvValue::Boolean => 1, | ||
- NvValue::Byte(_) => 1, | ||
- NvValue::Int16(_) => 1, | ||
- NvValue::Uint16(_) => 1, | ||
- NvValue::Int32(_) => 1, | ||
- NvValue::Uint32(_) => 1, | ||
- NvValue::Int64(_) => 1, | ||
- NvValue::Uint64(_) => 1, | ||
- NvValue::String(_) => 1, | ||
- NvValue::ByteArray(ref a) => a.len(), | ||
- NvValue::Int16Array(ref a) => a.len(), | ||
- NvValue::Uint16Array(ref a) => a.len(), | ||
- NvValue::Int32Array(ref a) => a.len(), | ||
- NvValue::Uint32Array(ref a) => a.len(), | ||
- NvValue::Int64Array(ref a) => a.len(), | ||
- NvValue::Uint64Array(ref a) => a.len(), | ||
- NvValue::StringArray(ref a) => a.len(), | ||
- NvValue::HrTime(_) => 1, | ||
- NvValue::NvList(_) => 1, | ||
- NvValue::NvListArray(ref a) => a.len(), | ||
- NvValue::BooleanValue(_) => 1, | ||
- NvValue::Int8(_) => 1, | ||
- NvValue::Uint8(_) => 1, | ||
- NvValue::BooleanArray(ref a) => a.len(), | ||
- NvValue::Int8Array(ref a) => a.len(), | ||
- NvValue::Uint8Array(ref a) => a.len(), | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl fmt::Debug for NvValue { | ||
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
- match *self { | ||
- NvValue::Int64(v) => write!(f, "Int64(0x{:X})", v), | ||
- NvValue::Uint64(v) => write!(f, "Uint64(0x{:X})", v), | ||
- NvValue::NvList(ref v) => write!(f, "NvList({:?})", v), | ||
- NvValue::NvListArray(ref v) => { | ||
- try!(write!(f, "NvListArray([")); | ||
- for nv_list in v { | ||
- try!(write!(f, "NvList({:?})", nv_list)); | ||
- } | ||
- write!(f, "])") | ||
- } | ||
- NvValue::String(ref v) => write!(f, "String({})", v), | ||
-            _ => write!(f, "{:?}", self.data_type()), // fall back to the type name to avoid infinite recursion | ||
- } | ||
- } | ||
-} | ||
- | ||
-#[derive(Copy, Clone, Debug)] | ||
-pub enum DataType { | ||
- Unknown = 0, | ||
- Boolean, | ||
- Byte, | ||
- Int16, | ||
- Uint16, | ||
- Int32, | ||
- Uint32, | ||
- Int64, | ||
- Uint64, | ||
- String, | ||
- ByteArray, | ||
- Int16Array, | ||
- Uint16Array, | ||
- Int32Array, | ||
- Uint32Array, | ||
- Int64Array, | ||
- Uint64Array, | ||
- StringArray, | ||
- HrTime, | ||
- NvList, | ||
- NvListArray, | ||
- BooleanValue, | ||
- Int8, | ||
- Uint8, | ||
- BooleanArray, | ||
- Int8Array, | ||
- Uint8Array, | ||
-} | ||
- | ||
-impl DataType { | ||
- pub fn from_u8(u: u8) -> Option<DataType> { | ||
- match u { | ||
- 0 => Some(DataType::Unknown), | ||
- 1 => Some(DataType::Boolean), | ||
- 2 => Some(DataType::Byte), | ||
- 3 => Some(DataType::Int16), | ||
- 4 => Some(DataType::Uint16), | ||
- 5 => Some(DataType::Int32), | ||
- 6 => Some(DataType::Uint32), | ||
- 7 => Some(DataType::Int64), | ||
- 8 => Some(DataType::Uint64), | ||
- 9 => Some(DataType::String), | ||
- 10 => Some(DataType::ByteArray), | ||
- 11 => Some(DataType::Int16Array), | ||
- 12 => Some(DataType::Uint16Array), | ||
- 13 => Some(DataType::Int32Array), | ||
- 14 => Some(DataType::Uint32Array), | ||
- 15 => Some(DataType::Int64Array), | ||
- 16 => Some(DataType::Uint64Array), | ||
- 17 => Some(DataType::StringArray), | ||
- 18 => Some(DataType::HrTime), | ||
- 19 => Some(DataType::NvList), | ||
- 20 => Some(DataType::NvListArray), | ||
- 21 => Some(DataType::BooleanValue), | ||
- 22 => Some(DataType::Int8), | ||
- 23 => Some(DataType::Uint8), | ||
- 24 => Some(DataType::BooleanArray), | ||
- 25 => Some(DataType::Int8Array), | ||
- 26 => Some(DataType::Uint8Array), | ||
- _ => None, | ||
- } | ||
- } | ||
- | ||
- pub fn to_u8(self) -> u8 { | ||
- match self { | ||
- DataType::Unknown => 0, | ||
- DataType::Boolean => 1, | ||
- DataType::Byte => 2, | ||
- DataType::Int16 => 3, | ||
- DataType::Uint16 => 4, | ||
- DataType::Int32 => 5, | ||
- DataType::Uint32 => 6, | ||
- DataType::Int64 => 7, | ||
- DataType::Uint64 => 8, | ||
- DataType::String => 9, | ||
- DataType::ByteArray => 10, | ||
- DataType::Int16Array => 11, | ||
- DataType::Uint16Array => 12, | ||
- DataType::Int32Array => 13, | ||
- DataType::Uint32Array => 14, | ||
- DataType::Int64Array => 15, | ||
- DataType::Uint64Array => 16, | ||
- DataType::StringArray => 17, | ||
- DataType::HrTime => 18, | ||
- DataType::NvList => 19, | ||
- DataType::NvListArray => 20, | ||
- DataType::BooleanValue => 21, | ||
- DataType::Int8 => 22, | ||
- DataType::Uint8 => 23, | ||
- DataType::BooleanArray => 24, | ||
- DataType::Int8Array => 25, | ||
- DataType::Uint8Array => 26, | ||
- } | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-pub trait GetNvValue<'a>: Sized { | ||
- fn get(value: &'a NvValue) -> Option<Self>; | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for bool { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::BooleanValue(v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for u8 { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::Byte(v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for u16 { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::Uint16(v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for u32 { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::Uint32(v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for u64 { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::Uint64(v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for i16 { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::Int16(v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for i32 { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::Int32(v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for i64 { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::Int64(v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for &'a String { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::String(ref v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for &'a NvList { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::NvList(ref v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl<'a> GetNvValue<'a> for &'a Vec<NvList> { | ||
- fn get(value: &'a NvValue) -> Option<Self> { | ||
- match *value { | ||
- NvValue::NvListArray(ref v) => Some(v), | ||
- _ => None, | ||
- } | ||
- } | ||
-} |
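Putting the pieces above together, the list behaves like a small typed key/value store. A minimal usage sketch (the key names and flag value are invented for the example, and it assumes the types are in scope via `use super::nvpair::{NvList, NvValue};`):

```rust
fn main() {
    let mut list = NvList::new(0);
    list.add("guid".to_string(), NvValue::Uint64(0xDEADBEEF));
    list.add("path".to_string(), NvValue::String("/dev/disk0".to_string()));

    // `get` goes through the GetNvValue impls, so the caller names the type.
    let guid: Option<u64> = list.get("guid");
    assert_eq!(guid, Some(0xDEADBEEF));

    // A type mismatch yields None rather than panicking.
    let not_a_number: Option<u64> = list.get("path");
    assert!(not_a_number.is_none());

    // `find` hands back the raw NvValue for callers that want to match on it.
    assert!(list.find("missing").is_none());
}
```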
266
crates/zfs/nvstream.rs
@@ -1,266 +0,0 @@ | ||
-use std::mem; | ||
- | ||
-use super::nvpair::{DataType, NV_VERSION, NvList, NvValue}; | ||
-use super::xdr; | ||
- | ||
-// nvlist pack encoding | ||
-const NV_ENCODE_NATIVE: u8 = 0; | ||
-const NV_ENCODE_XDR: u8 = 1; | ||
- | ||
-// nvlist pack endian | ||
-const NV_BIG_ENDIAN: u8 = 0; | ||
-const NV_LITTLE_ENDIAN: u8 = 1; | ||
- | ||
-// nvlist persistent unique name flags, stored in nvl_nvflags | ||
-const NV_UNIQUE_NAME: u32 = 0x1; | ||
-const NV_UNIQUE_NAME_TYPE: u32 = 0x2; | ||
- | ||
-// nvlist lookup pairs related flags | ||
-const NV_FLAG_NOENTOK: isize = 0x1; | ||
- | ||
-// NvList XDR format: | ||
-// - header (encoding and endian): 4 bytes | ||
-// - nvl version: 4 bytes | ||
-// - nv flags: 4 bytes | ||
-// - nv pairs: | ||
-// - encoded size: 4 bytes | ||
-// - decoded size: 4 bytes | ||
-//  - name: xdr string | len: 4 bytes, data: len rounded up to a multiple of 4 bytes | ||
-// - data type: 4 bytes | ||
-// - num elements: 4 bytes | ||
-// - data | ||
-// - 2 terminating zeros: 4 bytes | ||
-// | ||
-// NOTE: XDR aligns all of the smaller integer types to be 4 bytes, so `encode_u8` is actually | ||
-// writing 4 bytes | ||
-// | ||
-// I don't know why the ZFS developers decided to use i32's everywhere. Even for clearly | ||
-// unsigned things like array lengths. | ||
- | ||
-/// Name value stream header | ||
-#[derive(Debug)] | ||
-pub struct NvsHeader { | ||
- encoding: u8, // nvs encoding method | ||
- endian: u8, // nvs endian | ||
- reserved1: u8, // reserved for future use | ||
- reserved2: u8, // reserved for future use | ||
-} | ||
- | ||
-/// Encodes a NvList in XDR format | ||
-pub fn encode_nv_list(xdr: &mut xdr::Xdr, nv_list: &NvList) -> xdr::XdrResult<()> { | ||
- try!(encode_nv_list_header(xdr)); | ||
- | ||
- // Encode version and nvflag | ||
- try!(xdr.encode_i32(nv_list.version)); | ||
- try!(xdr.encode_u32(nv_list.nvflag)); | ||
- | ||
- // Encode the pairs | ||
- for &(ref name, ref value) in &nv_list.pairs { | ||
- // Encode name | ||
- // let encoded_size = 0; | ||
- // let decoded_size = 0; | ||
- try!(xdr.encode_string(name)); | ||
- | ||
- // TODO | ||
- | ||
- // Encode data type | ||
- try!(xdr.encode_u8(value.data_type().to_u8())); | ||
- | ||
- // Encode the number of elements | ||
- try!(xdr.encode_i32(value.num_elements() as i32)); | ||
- | ||
- // Encode the value | ||
- } | ||
- | ||
- // Encode 2 terminating zeros | ||
- try!(xdr.encode_i32(0)); | ||
- try!(xdr.encode_i32(0)); | ||
- Ok(()) | ||
-} | ||
- | ||
-fn encode_nv_list_header(xdr: &mut xdr::Xdr) -> xdr::XdrResult<()> { | ||
- let header = NvsHeader { | ||
- encoding: NV_ENCODE_XDR, | ||
- endian: NV_LITTLE_ENDIAN, | ||
- reserved1: 0, | ||
- reserved2: 0, | ||
- }; | ||
- let header_bytes: [u8; 4] = unsafe { mem::transmute(header) }; | ||
- try!(xdr.encode_opaque(&header_bytes)); | ||
- Ok(()) | ||
-} | ||
- | ||
-/// Decodes a NvList in XDR format | ||
-pub fn decode_nv_list(xdr: &mut xdr::Xdr) -> xdr::XdrResult<NvList> { | ||
- try!(decode_nv_list_header(xdr)); | ||
- | ||
- decode_nv_list_embedded(xdr) | ||
-} | ||
- | ||
-pub fn decode_nv_list_embedded(xdr: &mut xdr::Xdr) -> xdr::XdrResult<NvList> { | ||
- // Decode version and nvflag | ||
- let version = try!(xdr.decode_i32()); | ||
- let nvflag = try!(xdr.decode_u32()); | ||
- | ||
- // TODO: Give an actual error | ||
- if version != NV_VERSION { | ||
- return Err(xdr::XdrError); | ||
- } | ||
- | ||
- let mut nv_list = NvList::new(nvflag); | ||
- | ||
- // Decode the pairs | ||
- loop { | ||
-        // Decode encoded/decoded size | ||
- let encoded_size = try!(xdr.decode_u32()); | ||
- let decoded_size = try!(xdr.decode_u32()); | ||
- | ||
- // Check for 2 terminating zeros | ||
- if encoded_size == 0 && decoded_size == 0 { | ||
- break; | ||
- } | ||
- | ||
- // Decode name | ||
- let name = try!(xdr.decode_string()); | ||
- | ||
- // Decode data type | ||
- let data_type = match DataType::from_u8(try!(xdr.decode_u8())) { | ||
- Some(dt) => dt, | ||
- None => { | ||
- return Err(xdr::XdrError); | ||
- } | ||
- }; | ||
- | ||
- // Decode the number of elements | ||
- let num_elements = try!(xdr.decode_i32()) as usize; | ||
- | ||
- // Decode the value | ||
- let value = try!(decode_nv_value(xdr, data_type, num_elements)); | ||
- | ||
- // Add the value to the list | ||
- nv_list.pairs.push((name, value)); | ||
- } | ||
- | ||
- Ok(nv_list) | ||
-} | ||
- | ||
-fn decode_nv_list_header(xdr: &mut xdr::Xdr) -> xdr::XdrResult<()> { | ||
- let mut bytes: [u8; 4] = [0; 4]; | ||
- try!(xdr.decode_opaque(&mut bytes)); | ||
- let header: NvsHeader = unsafe { mem::transmute(bytes) }; | ||
- | ||
- if header.encoding != NV_ENCODE_XDR { | ||
- return Err(xdr::XdrError); | ||
- } | ||
- Ok(()) | ||
-} | ||
- | ||
-fn decode_nv_value(xdr: &mut xdr::Xdr, | ||
- data_type: DataType, | ||
- num_elements: usize) | ||
- -> xdr::XdrResult<NvValue> { | ||
- match data_type { | ||
- DataType::Unknown => Ok(NvValue::Unknown), | ||
- DataType::Boolean => Ok(NvValue::Boolean), | ||
- DataType::Byte => Ok(NvValue::Byte(try!(xdr.decode_u8()))), | ||
- DataType::Int16 => Ok(NvValue::Int16(try!(xdr.decode_i16()))), | ||
- DataType::Uint16 => Ok(NvValue::Uint16(try!(xdr.decode_u16()))), | ||
- DataType::Int32 => Ok(NvValue::Int32(try!(xdr.decode_i32()))), | ||
- DataType::Uint32 => Ok(NvValue::Uint32(try!(xdr.decode_u32()))), | ||
- DataType::Int64 => Ok(NvValue::Int64(try!(xdr.decode_i64()))), | ||
- DataType::Uint64 => Ok(NvValue::Uint64(try!(xdr.decode_u64()))), | ||
- DataType::String => Ok(NvValue::String(try!(xdr.decode_string()))), | ||
- DataType::ByteArray => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_u8()); | ||
- } | ||
- Ok(NvValue::ByteArray(v)) | ||
- } | ||
- DataType::Int16Array => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_i16()); | ||
- } | ||
- Ok(NvValue::Int16Array(v)) | ||
- } | ||
- DataType::Uint16Array => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_u16()); | ||
- } | ||
- Ok(NvValue::Uint16Array(v)) | ||
- } | ||
- DataType::Int32Array => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_i32()); | ||
- } | ||
- Ok(NvValue::Int32Array(v)) | ||
- } | ||
- DataType::Uint32Array => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_u32()); | ||
- } | ||
- Ok(NvValue::Uint32Array(v)) | ||
- } | ||
- DataType::Int64Array => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_i64()); | ||
- } | ||
- Ok(NvValue::Int64Array(v)) | ||
- } | ||
- DataType::Uint64Array => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_u64()); | ||
- } | ||
- Ok(NvValue::Uint64Array(v)) | ||
- } | ||
- DataType::StringArray => { | ||
-            let mut v = Vec::with_capacity(num_elements); | ||
-            for _ in 0..num_elements { | ||
-                v.push(try!(xdr.decode_string())); | ||
-            } | ||
-            Ok(NvValue::StringArray(v)) | ||
- } | ||
- DataType::HrTime => Ok(NvValue::HrTime(try!(xdr.decode_i64()))), | ||
- DataType::NvList => { | ||
- let nv_list = try!(decode_nv_list_embedded(xdr)); | ||
- Ok(NvValue::NvList(nv_list)) | ||
- } | ||
- DataType::NvListArray => { | ||
- let mut v = Vec::with_capacity(num_elements); | ||
- for _ in 0..num_elements { | ||
- v.push(try!(decode_nv_list_embedded(xdr))); | ||
- } | ||
- Ok(NvValue::NvListArray(v)) | ||
- } | ||
- DataType::BooleanValue => Ok(NvValue::BooleanValue(try!(xdr.decode_bool()))), | ||
- DataType::Int8 => Ok(NvValue::Int8(try!(xdr.decode_i8()))), | ||
- DataType::Uint8 => Ok(NvValue::Uint8(try!(xdr.decode_u8()))), | ||
- DataType::BooleanArray => { | ||
- let mut v = vec![false; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_bool()); | ||
- } | ||
- Ok(NvValue::BooleanArray(v)) | ||
- } | ||
- DataType::Int8Array => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_i8()); | ||
- } | ||
- Ok(NvValue::Int8Array(v)) | ||
- } | ||
- DataType::Uint8Array => { | ||
- let mut v = vec![0; num_elements]; | ||
- for v in &mut v { | ||
- *v = try!(xdr.decode_u8()); | ||
- } | ||
- Ok(NvValue::Uint8Array(v)) | ||
- } | ||
- } | ||
-} |
319
crates/zfs/spa.rs
@@ -1,319 +0,0 @@ | ||
-use std::cmp; | ||
-use std::rc::Rc; | ||
- | ||
-use super::avl; | ||
-use super::dmu_objset::ObjectSet; | ||
-use super::dsl_pool; | ||
-use super::metaslab::{self, MetaslabClass}; | ||
-use super::nvpair::{NvList, NvValue}; | ||
-use super::taskq::Taskq; | ||
-use super::txg; | ||
-use super::uberblock::Uberblock; | ||
-use super::vdev; | ||
-use super::zfs; | ||
-use super::zio; | ||
- | ||
-pub enum ImportType { | ||
- Existing, | ||
- Assemble, | ||
-} | ||
- | ||
-// Storage pool allocator | ||
-pub struct Spa { | ||
- name: String, // Pool name | ||
- config: NvList, | ||
- state: zfs::PoolState, | ||
- load_state: zfs::SpaLoadState, | ||
- zio_taskq: Vec<Vec<SpaTaskqs>>, | ||
- // dsl_pool: DslPool, | ||
- normal_class: Rc<MetaslabClass>, // normal data class | ||
- log_class: Rc<MetaslabClass>, // intent log data class | ||
- first_txg: u64, | ||
- mos: ObjectSet, | ||
- vdev_tree: vdev::Tree, | ||
- root_vdev: vdev::TreeIndex, | ||
- // ubsync: Uberblock, // Last synced uberblock | ||
- // uberblock: Uberblock, // Current active uberblock | ||
- did: u64, // if procp != p0, did of t1 | ||
-} | ||
- | ||
-impl Spa { | ||
- pub fn create(name: String, nvroot: &NvList) -> zfs::Result<Self> { | ||
- let mut config = NvList::new(0); | ||
- config.add("name".to_string(), NvValue::String(name.clone())); | ||
- Self::new(name, config, vdev::AllocType::Add) | ||
- } | ||
- | ||
- pub fn import(name: String, config: NvList) -> zfs::Result<Self> { | ||
- let load_state = zfs::SpaLoadState::Import; | ||
- | ||
- // note that mos_config is true - we trust the user's config in this case | ||
- let mut spa = try!(Self::load(name, config, load_state, ImportType::Existing, true)); | ||
- | ||
- spa.activate(); | ||
- | ||
- Ok(spa) | ||
- } | ||
- | ||
- // pub fn open(&mut self) -> zfs::Result<()> { | ||
- // let load_state = zfs::SpaLoadState::Open; | ||
- // if self.state == zfs::PoolState::Uninitialized { | ||
- // First time opening | ||
- // self.activate(); | ||
- // try!(self.load(load_state, ImportType::Existing, false)); | ||
- // } | ||
- // | ||
- // Ok(()) | ||
- // } | ||
- | ||
- fn new(name: String, config: NvList, vdev_alloc_type: vdev::AllocType) -> zfs::Result<Self> { | ||
- let metaslab_ops = Rc::new(metaslab::MetaslabOps { alloc: metaslab::ff_alloc }); | ||
- let normal_class = Rc::new(MetaslabClass::create(metaslab_ops.clone())); | ||
- let log_class = Rc::new(MetaslabClass::create(metaslab_ops)); | ||
- | ||
- // Parse vdev tree | ||
- let mut vdev_tree = vdev::Tree::new(); | ||
- let root_vdev = { | ||
- let nvroot: &NvList = try!(config.get("vdev_tree").ok_or(zfs::Error::Invalid)); | ||
- try!(vdev_tree.parse(&normal_class, nvroot, None, vdev_alloc_type)) | ||
- }; | ||
- | ||
- Ok(Spa { | ||
- name: name, | ||
- config: config, | ||
- state: zfs::PoolState::Uninitialized, | ||
- load_state: zfs::SpaLoadState::None, | ||
- zio_taskq: Vec::new(), | ||
- // dsl_pool: blah, | ||
- normal_class: normal_class, | ||
- log_class: log_class, | ||
- first_txg: 0, | ||
- mos: ObjectSet, | ||
- vdev_tree: vdev_tree, | ||
- root_vdev: root_vdev, | ||
- did: 0, | ||
- }) | ||
- } | ||
- | ||
- fn load(name: String, | ||
- config: NvList, | ||
- load_state: zfs::SpaLoadState, | ||
- import_type: ImportType, | ||
- mos_config: bool) | ||
- -> zfs::Result<Self> { | ||
- let pool_guid = try!(config.get("pool_guid").ok_or(zfs::Error::Invalid)); | ||
- | ||
- let mut spa = try!(Self::load_impl(name, | ||
- pool_guid, | ||
- config, | ||
- load_state, | ||
- import_type, | ||
- mos_config)); | ||
- spa.load_state = zfs::SpaLoadState::None; | ||
- | ||
- Ok(spa) | ||
- } | ||
- | ||
- /// mosconfig: Whether `config` came from on-disk MOS and so is trusted, or was user-made and so | ||
- /// is untrusted. | ||
- fn load_impl(name: String, | ||
- pool_guid: u64, | ||
- config: NvList, | ||
- load_state: zfs::SpaLoadState, | ||
- import_type: ImportType, | ||
- mos_config: bool) | ||
- -> zfs::Result<Self> { | ||
- // Determine the vdev allocation type from import type | ||
- let vdev_alloc_type = match import_type { | ||
- ImportType::Existing => vdev::AllocType::Load, | ||
- ImportType::Assemble => vdev::AllocType::Split, | ||
- }; | ||
- | ||
- let mut spa = try!(Self::new(name, config, vdev_alloc_type)); | ||
- spa.load_state = load_state; | ||
- | ||
- // Create "The Godfather" zio to hold all async IOs | ||
- // spa.spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); | ||
- // for i in 0..max_ncpus { | ||
- // spa.async_zio_root[i] = | ||
- // Zio::root(spa, None, None, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); | ||
- // } | ||
- | ||
- | ||
- // TODO: Try to open all vdevs, loading each label in the process. | ||
- | ||
- // TODO | ||
- // Find the best uberblock. | ||
- // vdev_uberblock_load(rvd, ub, &label); | ||
- | ||
- // If we weren't able to find a single valid uberblock, return failure. | ||
- // if ub.txg == 0 { | ||
- // return spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO); | ||
- // } | ||
- | ||
- | ||
- // Initialize internal structures | ||
- spa.state = zfs::PoolState::Active; | ||
- // spa.ubsync = spa.uberblock; | ||
- // spa.verify_min_txg = | ||
- // if spa.extreme_rewind { | ||
- // txg::TXG_INITIAL - 1 | ||
- // } else { | ||
- // spa.last_synced_txg() - txg::DEFER_SIZE - 1; | ||
- // }; | ||
- // spa.first_txg = | ||
- // if spa.last_ubsync_txg { spa.last_ubsync_txg } else { spa.last_synced_txg() + 1 }; | ||
- // spa.claim_max_txg = spa.first_txg; | ||
- // spa.prev_software_version = ub.software_version; | ||
- | ||
- // spa.dsl_pool = try!(dsl_pool::DslPool::init(&mut spa, spa.first_txg)); | ||
- // if error { return spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO); } | ||
- // spa.meta_objset = spa.dsl_pool.meta_objset; | ||
- | ||
- // Load stuff for the top-level and leaf vdevs | ||
- spa.vdev_tree.load(&mut spa.mos, spa.root_vdev); | ||
- | ||
- Ok(spa) | ||
- } | ||
- | ||
- fn activate(&mut self) { | ||
- // assert!(self.state == zfs::PoolState::Uninitialized); | ||
- | ||
- self.state = zfs::PoolState::Active; | ||
- | ||
- // TODO: maybe start the spa thread | ||
- | ||
- self.create_zio_taskqs(); | ||
- | ||
- self.did = 0; | ||
- } | ||
- | ||
- // fn taskqs_init(&mut self, t: zio::Type, q: zio::TaskqType) { | ||
- // const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; | ||
- // zti_modes mode = ztip.mode; | ||
- // let value = ztip.value; | ||
- // let count = ztip.count; | ||
- // let ref tqs = self.zio_taskq[t][q]; | ||
- // let flags = TASKQ_DYNAMIC; | ||
- // let mut batch: bool = false; | ||
- // | ||
- // if mode == ZTI_MODE_NULL { | ||
- // tqs.count = 0; | ||
- // tqs.taskq = NULL; | ||
- // return; | ||
- // } | ||
- // | ||
- // assert!(count > 0); | ||
- // | ||
- // tqs.count = count; | ||
- // tqs.taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); | ||
- // | ||
- // match mode { | ||
- // ZTI_MODE_FIXED => { | ||
- // assert!(value >= 1); | ||
- // value = cmp::max(value, 1); | ||
- // }, | ||
- // ZTI_MODE_BATCH => { | ||
- // batch = true; | ||
- // flags |= TASKQ_THREADS_CPU_PCT; | ||
- // value = zio_taskq_batch_pct; | ||
- // }, | ||
- // _ => { | ||
- // panic!("unrecognized mode for %s_%s taskq (%u:%u) in spa_activate()", | ||
- // zio_type_name[t], zio_taskq_types[q], mode, value); | ||
- // }, | ||
- // } | ||
- // | ||
- // for i in 0..count { | ||
- // taskq_t *tq; | ||
- // char name[32]; | ||
- // | ||
- // if (count > 1) { | ||
- // snprintf(name, sizeof (name), "%s_%s_%u", | ||
- // zio_type_name[t], zio_taskq_types[q], i); | ||
- // } else { | ||
- // snprintf(name, sizeof (name), "%s_%s", | ||
- // zio_type_name[t], zio_taskq_types[q]); | ||
- // } | ||
- // | ||
- // if zio_taskq_sysdc && spa->spa_proc != &p0 { | ||
- // if batch { | ||
- // flags |= TASKQ_DC_BATCH; | ||
- // } | ||
- // | ||
- // tq = taskq_create_sysdc(name, value, 50, INT_MAX, | ||
- // spa->spa_proc, zio_taskq_basedc, flags); | ||
- // } else { | ||
- // pri_t pri = maxclsyspri; | ||
- // The write issue taskq can be extremely CPU | ||
- // intensive. Run it at slightly less important | ||
- // priority than the other taskqs. Under Linux this | ||
-            // means incrementing the priority value; on platforms | ||
- // like illumos it should be decremented. | ||
- // if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) | ||
- // pri += 1; | ||
- // | ||
- // tq = taskq_create_proc(name, value, pri, 50, | ||
- // INT_MAX, spa->spa_proc, flags); | ||
- // } | ||
- // | ||
- // tqs->taskq[i] = tq; | ||
- // } | ||
- // } | ||
- | ||
- fn create_zio_taskqs(&mut self) { | ||
- for t in 0..zio::NUM_TYPES { | ||
- for q in 0..zio::NUM_TASKQ_TYPES { | ||
- // self.taskqs_init(t, q); | ||
- } | ||
- } | ||
- } | ||
- | ||
- fn last_synced_txg(&self) -> u64 { | ||
- // TODO | ||
- // self.ubsync.ub_txg | ||
- 0 | ||
- } | ||
- | ||
- fn first_txg(&self) -> u64 { | ||
- self.first_txg | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-struct ZioTaskqInfo { | ||
- // mode: zti_modes_t, | ||
- value: usize, | ||
- count: usize, | ||
-} | ||
- | ||
-struct SpaTaskqs { | ||
- count: usize, | ||
- taskq: Vec<Vec<Taskq>>, | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-pub struct SpaNamespace { | ||
- // TODO: Use &str instead of String as key type. Lifetimes are hard. | ||
- avl: avl::Tree<Spa, String>, // AVL tree of Spa sorted by name | ||
-} | ||
- | ||
-impl SpaNamespace { | ||
- pub fn new() -> Self { | ||
- SpaNamespace { avl: avl::Tree::new(Rc::new(|x| x.name.clone())) } | ||
- } | ||
- | ||
- pub fn add(&mut self, spa: Spa) { | ||
- self.avl.insert(spa); | ||
- } | ||
- | ||
- pub fn find(&self, name: String) -> Option<&Spa> { | ||
- self.avl.find(name) | ||
- } | ||
- | ||
- pub fn find_mut(&mut self, name: String) -> Option<&mut Spa> { | ||
- self.avl.find_mut(name) | ||
- } | ||
-} |
207
crates/zfs/space_map.rs
@@ -1,207 +0,0 @@ | ||
-use std::{fmt, mem}; | ||
- | ||
-use super::avl; | ||
-use super::dmu_objset::ObjectSet; | ||
-use super::from_bytes::FromBytes; | ||
-use super::zfs; | ||
- | ||
-const SPACE_MAP_HISTOGRAM_SIZE: usize = 32; | ||
- | ||
-/// The `SpaceMapPhys` is the on-disk representation of the space map. | ||
-/// Consumers of space maps should never reference any of the members of this | ||
-/// structure directly. These members may only be updated in syncing context. | ||
-/// | ||
-/// Note the smp_object is no longer used but remains in the structure | ||
-/// for backward compatibility. | ||
-/// | ||
-/// The smp_histogram maintains a histogram of free regions. Each | ||
-/// bucket, smp_histogram[i], contains the number of free regions | ||
-/// whose size is: | ||
-/// 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) | ||
-#[derive(Debug)] | ||
-pub struct SpaceMapPhys { | ||
- object: u64, // on-disk space map object | ||
- objsize: u64, // size of the object | ||
- alloc: u64, /* space allocated from the map | ||
- * pad: [u64; 5], // reserved | ||
- * histogram: [u64; SPACE_MAP_HISTOGRAM_SIZE], */ | ||
-} | ||
- | ||
-impl FromBytes for SpaceMapPhys {} | ||
- | ||
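The histogram rule quoted above means bucket `i` is simply `floor(log2(size)) - sm_shift`. A small sketch, with `sm_shift = 9` (512-byte units) picked only for the example:

```rust
// Bucket index for a free region of `size` bytes, following
// 2^(i + sm_shift) <= size < 2^(i + sm_shift + 1).
fn histogram_bucket(size: u64, sm_shift: u32) -> usize {
    let log2 = 63 - size.leading_zeros(); // floor(log2(size)), size must be > 0
    (log2 - sm_shift) as usize
}

fn main() {
    let sm_shift = 9;
    assert_eq!(histogram_bucket(64 * 1024, sm_shift), 7);  // exactly 2^16 bytes
    assert_eq!(histogram_bucket(70_000, sm_shift), 7);     // still below 2^17
    assert_eq!(histogram_bucket(128 * 1024, sm_shift), 8); // next bucket up
}
```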
-pub struct SpaceMap { | ||
- start: u64, // start of map | ||
- size: u64, // size of map | ||
- shift: u8, // unit shift | ||
- length: u64, // synced length | ||
- alloc: u64, // synced space allocated | ||
- // os: *ObjectSet, // objset for this map | ||
- object: u64, // object id for this map | ||
- blksz: u32, // block size for space map | ||
- // dbuf: *dmu_dbuf_t, // space_map_phys_t dbuf | ||
- phys: SpaceMapPhys, // on-disk space map | ||
-} | ||
- | ||
-impl SpaceMap { | ||
- /// Returns SpaceMapPhys, Dbuf, and block size | ||
- // TODO | ||
- // fn open_impl(os: &mut ObjectSet, object: u64) -> zfs::Result<(SpaceMapPhys, dmu::Dbuf, u64)> { | ||
- // let dbuf = try!(dmu_bonus_hold(os, object, sm)); | ||
- // | ||
- // let (block_size, num_blocks) = dmu_object_size_from_db(dbuf); | ||
- // let phys = SpaceMapPhys::from_bytes(dbuf.data); | ||
- // | ||
- // Ok((phys, dbuf, block_size)) | ||
- // } | ||
- | ||
- | ||
- pub fn open(os: &mut ObjectSet, | ||
- object: u64, | ||
- start: u64, | ||
- size: u64, | ||
- shift: u8) | ||
- -> zfs::Result<Self> { | ||
- assert!(object != 0); | ||
- | ||
- // TODO | ||
- // let (phys, dbuf, block_size) = try!(Self::open_impl(os, object)); | ||
- let phys = SpaceMapPhys { | ||
- object: 0, // on-disk space map object | ||
- objsize: 0, // size of the object | ||
- alloc: 0, // space allocated from the map | ||
- }; | ||
- let block_size = 0; | ||
- | ||
- let mut space_map = SpaceMap { | ||
- start: start, | ||
- size: size, | ||
- shift: shift, | ||
- // os: os, | ||
- object: object, | ||
- length: 0, | ||
- alloc: 0, | ||
- blksz: block_size, | ||
- // dbuf: dbuf, | ||
- phys: phys, | ||
- }; | ||
- | ||
- Ok(space_map) | ||
- } | ||
- | ||
- pub fn load_avl(&self, | ||
- tree: &mut avl::Tree<Segment, u64>, | ||
- bytes: &[u8], | ||
- map_type: MapType) | ||
- -> Result<(), String> { | ||
- for i in 0..(self.size as usize) { | ||
- let entry = Entry::from_bytes(&bytes[i * mem::size_of::<Entry>()..]).unwrap(); | ||
- let entry_map_type = match entry.map_type() { | ||
- Some(map_type) => map_type, | ||
- None => { | ||
- return Err("Invalid map type".to_string()); | ||
- } | ||
- }; | ||
- if entry.debug() != 1 && entry_map_type == map_type { | ||
- // it's not a debug entry and it's the right map type, add it to the tree | ||
- tree.insert(Segment::from_entry(&entry)); | ||
- } | ||
- } | ||
- tree.in_order(|node| { | ||
- println!("{:?}", node.value()); | ||
- }); | ||
- | ||
- Ok(()) | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
-#[derive(Copy, Clone, Debug, PartialEq)] | ||
-pub enum MapType { | ||
- Alloc = 0, | ||
- Free = 1, | ||
-} | ||
- | ||
-impl MapType { | ||
- pub fn from_u64(u: u64) -> Option<Self> { | ||
- match u { | ||
- 0 => Some(MapType::Alloc), | ||
- 1 => Some(MapType::Free), | ||
- _ => None, | ||
- } | ||
- } | ||
-} | ||
- | ||
-#[derive(Copy, Clone)] | ||
-pub struct Entry(u64); | ||
- | ||
-impl FromBytes for Entry {} | ||
- | ||
-impl Entry { | ||
- pub fn debug(&self) -> u64 { | ||
- (self.0 >> 63) & 0x1 // 1 bit long | ||
- } | ||
- | ||
- // Non-debug entries | ||
- | ||
- pub fn size(&self) -> u64 { | ||
- self.0 & 0x7FFF // 15 bits long | ||
- } | ||
- | ||
- pub fn map_type(&self) -> Option<MapType> { | ||
- MapType::from_u64((self.0 >> 15) & 0x1) // 1 bit long | ||
- } | ||
- | ||
- pub fn offset(&self) -> u64 { | ||
-        (self.0 >> 16) & 0x7FFFFFFFFFFF // 47 bits long | ||
- } | ||
- | ||
- // Debug entries | ||
- | ||
- pub fn action(&self) -> u64 { | ||
- (self.0 >> 60) & 0x7 // 3 bits long | ||
- } | ||
- | ||
- pub fn sync_pass(&self) -> u64 { | ||
- (self.0 >> 50) & 0x3FF // 10 bits long | ||
- } | ||
- | ||
- pub fn txg(&self) -> u64 { | ||
-        self.0 & 0x3FFFFFFFFFFFF // 50 bits long | ||
- } | ||
-} | ||
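As a sanity check on the bit layout encoded by these accessors, a non-debug entry can be hand-built and read back (this assumes the snippet runs inside this module, since the `Entry` tuple field is private):

```rust
fn main() {
    // Non-debug layout: bit 63 = 0, bits 62..16 = offset,
    // bit 15 = map type, bits 14..0 = size.
    let offset: u64 = 0x10;
    let size: u64 = 0x8;
    let raw = (offset << 16) | (1 << 15) | size; // map type 1 = Free
    let entry = Entry(raw);

    assert_eq!(entry.debug(), 0);
    assert_eq!(entry.offset(), 0x10);
    assert_eq!(entry.size(), 0x8);
    assert_eq!(entry.map_type(), Some(MapType::Free));
}
```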
- | ||
-impl fmt::Debug for Entry { | ||
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
- if self.debug() == 1 { | ||
- try!(write!(f, | ||
- "DEBUG: action:0x{:X} sync_pass:{:X} txg:0x{:X}", | ||
- self.action(), | ||
- self.sync_pass(), | ||
- self.txg())); | ||
- } else { | ||
- try!(write!(f, | ||
- "ENTRY: size:0x{:X} map_type:{:?} offset:0x{:X}", | ||
- self.size(), | ||
- self.map_type(), | ||
- self.offset())); | ||
- } | ||
- Ok(()) | ||
- } | ||
-} | ||
- | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
-#[derive(Debug)] | ||
-pub struct Segment { | ||
- pub start: u64, | ||
- pub size: u64, | ||
-} | ||
- | ||
-impl Segment { | ||
- fn from_entry(entry: &Entry) -> Self { | ||
- Segment { | ||
- start: entry.offset(), | ||
- size: entry.size(), | ||
- } | ||
- } | ||
-} |
371
crates/zfs/taskq.rs
@@ -1,371 +0,0 @@ | ||
-use std::cmp; | ||
-// use std::collections::VecDeque; | ||
-// use std::sync::mpsc::{channel, Sender, Receiver}; | ||
-use std::thread; | ||
- | ||
-use super::zfs; | ||
- | ||
-const TQENT_FLAG_PREALLOC: u64 = 0x1; // taskq_dispatch_ent used | ||
- | ||
-const TASKQ_PREPOPULATE: u64 = 0x0001; | ||
-const TASKQ_CPR_SAFE: u64 = 0x0002; // Use CPR safe protocol | ||
-const TASKQ_DYNAMIC: u64 = 0x0004; // Use dynamic thread scheduling | ||
-const TASKQ_THREADS_CPU_PCT: u64 = 0x0008; // Scale # threads by # cpus | ||
-const TASKQ_DC_BATCH: u64 = 0x0010; // Mark threads as batch | ||
- | ||
-// const TQ_SLEEP: u64 = KM_SLEEP; // Can block for memory | ||
-// const TQ_NOSLEEP: u64 = KM_NOSLEEP; // Cannot block for memory; may fail | ||
-const TQ_NOQUEUE: u64 = 0x02; // Do not enqueue if can't dispatch | ||
-const TQ_FRONT: u64 = 0x08; // Queue in front | ||
- | ||
-const TASKQ_ACTIVE: u64 = 0x00010000; | ||
- | ||
-pub type TaskFn = Box<FnMut()>; | ||
- | ||
-pub struct Taskq { | ||
- name: String, | ||
- // kmutex_t lock, | ||
- // krwlock_t threadlock, | ||
- // kcondvar_t dispatch_cv, | ||
- // kcondvar_t wait_cv,*/ | ||
- // threads: Vec<Sender<Task>>, | ||
- flags: u64, | ||
- active: u16, | ||
- num_threads: u16, | ||
- num_alloc: u64, | ||
- min_alloc: u64, | ||
- max_alloc: u64, | ||
- next_task_id: usize, | ||
- // kcondvar_t max_alloc_cv, | ||
- max_alloc_wait: i64, /* taskq_ent_t *freelist, | ||
- * task_queue: VecDeque<Task>, */ | ||
-} | ||
- | ||
-impl Taskq { | ||
- pub fn new(name: String, | ||
- mut num_threads: u16, | ||
- min_alloc: u64, | ||
- max_alloc: u64, | ||
- flags: u64) | ||
- -> Self { | ||
- // taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); | ||
- | ||
- // if flags & TASKQ_THREADS_CPU_PCT != 0 { | ||
- // int pct; | ||
- // assert!(num_threads >= 0); | ||
- // assert!(num_threads <= 100); | ||
- // pct = cmp::min(num_threads, 100); | ||
- // pct = cmp::max(pct, 0); | ||
- // | ||
- // num_threads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; | ||
- // num_threads = cmp::max(num_threads, 1); /* need at least 1 thread */ | ||
- // } else { | ||
- // assert!(num_threads >= 1); | ||
- // } | ||
- | ||
- // rw_init(&tq.threadlock, NULL, RW_DEFAULT, NULL); | ||
- // mutex_init(&tq.lock, NULL, MUTEX_DEFAULT, NULL); | ||
- // cv_init(&tq.dispatch_cv, NULL, CV_DEFAULT, NULL); | ||
- // cv_init(&tq.wait_cv, NULL, CV_DEFAULT, NULL); | ||
- // cv_init(&tq.max_alloc_cv, NULL, CV_DEFAULT, NULL); | ||
- // tq.task.next: &tq.task; | ||
- // tq.task.prev: &tq.task; | ||
- | ||
- // if flags & TASKQ_PREPOPULATE != 0 { | ||
- // mutex_enter(&tq.lock); | ||
- // while (min_alloc-- > 0) | ||
- // task_free(tq, task_alloc(tq, KM_SLEEP)); | ||
- // mutex_exit(&tq.lock); | ||
- // } | ||
- | ||
- // let mut threads = Vec::new(); | ||
- // for _ in 0..num_threads { | ||
- // let (task_t, task_r) = channel(); | ||
- // threads.push(task_t); | ||
- // thread::spawn(|| { taskq_thread(task_r) }); | ||
- // tq.thread_list[t] = thread_create(NULL, 0, taskq_thread, tq, TS_RUN, NULL, 0, pri); | ||
- // VERIFIY(tq.thread_list[t]); | ||
- // } | ||
- | ||
- Taskq { | ||
- name: name, | ||
- // threads: threads, | ||
- flags: flags | TASKQ_ACTIVE, | ||
- active: num_threads, | ||
- num_threads: num_threads, | ||
- num_alloc: 0, | ||
- min_alloc: min_alloc, | ||
- max_alloc: max_alloc, | ||
- next_task_id: 0, | ||
- max_alloc_wait: 0, // task_queue: VecDeque::new(), | ||
- } | ||
- } | ||
- | ||
- // fn alloc_task(&mut self, tqflags: u64) -> Self { | ||
- // taskq_ent_t *t; | ||
- // | ||
- // loop { | ||
- // if (t = self.freelist) != NULL && self.num_alloc >= self.min_alloc { | ||
- // There's a free Task in the free_list | ||
- // assert!(t.flags & TQENT_FLAG_PREALLOC == 0); | ||
- // self.freelist = t.next; | ||
- // } else { | ||
- // if (self.num_alloc >= self.max_alloc) { | ||
- // if tqflags & KM_SLEEP == 0 { | ||
- // return NULL; | ||
- // } | ||
- // | ||
- // We don't want to exceed max_alloc, but we can't | ||
- // wait for other tasks to complete (and thus free up | ||
- // task structures) without risking deadlock with | ||
- // the caller. So, we just delay for one second | ||
- // to throttle the allocation rate. If any tasks complete | ||
- // before the one-second timeout expires, taskq_ent_free will | ||
- // signal us and we will immediately retry the allocation. | ||
- // self.max_alloc_wait += 1; | ||
- // let rv = cv_timedwait(&self.max_alloc_cv, &self.lock, ddi_get_lbolt() + hz); | ||
- // self.max_alloc_wait -= 1; | ||
- // if rv > 0 { | ||
- // continue; | ||
- // } | ||
- // } | ||
- // mutex_exit(&self.lock); | ||
- // | ||
- // t = kmem_alloc(sizeof (taskq_ent_t), tqflags); | ||
- // | ||
- // mutex_enter(&self.lock); | ||
- // if t != NULL { | ||
- // Make sure we start without any flags | ||
- // t.flags = 0; | ||
- // self.num_alloc++; | ||
- // } | ||
- // } | ||
- // | ||
- // break; | ||
- // } | ||
- // return t; | ||
- // } | ||
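- | ||
- // A hedged sketch, not part of the original port, of the throttle described | ||
- // above. It assumes Taskq later grows a `lock: Mutex<u64>` and a | ||
- // `max_alloc_cv: Condvar` field (neither exists on the struct yet): | ||
- // | ||
- // use std::time::Duration; | ||
- // let guard = self.lock.lock().unwrap(); | ||
- // let (_guard, _timed_out) = self.max_alloc_cv | ||
- // .wait_timeout(guard, Duration::from_secs(1)) | ||
- // .unwrap(); | ||
- // | ||
- // That is, block for at most one second, and retry the allocation right away | ||
- // if a completing task signals the condvar before the timeout fires. | ||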
- | ||
- // fn task_free(taskq_t *tq, taskq_ent_t *t) { | ||
- // if (tq->tq_nalloc <= tq->tq_min_alloc) { | ||
- // t->tqent_next = tq->tq_freelist; | ||
- // tq->tq_freelist = t; | ||
- // } else { | ||
- // tq->tq_nalloc--; | ||
- // mutex_exit(&tq->tq_lock); | ||
- // kmem_free(t, sizeof (taskq_ent_t)); | ||
- // mutex_enter(&tq->tq_lock); | ||
- // } | ||
- // | ||
- // if (tq->tq_max_alloc_wait) { | ||
- // cv_signal(&tq->tq_max_alloc_cv); | ||
- // } | ||
- // } | ||
- | ||
- fn taskq_dispatch(&mut self, func: TaskFn, flags: u64) -> TaskId { | ||
- // self.threads[0].send(Task { func: func, flags: flags }); | ||
- let index = self.next_task_id; | ||
- self.next_task_id += 1; | ||
- TaskId(index) | ||
- } | ||
- | ||
- // fn taskq_dispatch(&mut self, func: TaskFn, flags: u64) -> TaskId { | ||
- // taskq_ent_t *t; | ||
- // | ||
- // if taskq_now { | ||
- // func(arg); | ||
- // return 1; | ||
- // } | ||
- // | ||
- // mutex_enter(&self.lock); | ||
- // assert!(self.flags & TASKQ_ACTIVE); | ||
- // if (t = self.alloc_task(tqflags)) == NULL { | ||
- // mutex_exit(&self.lock); | ||
- // return 0; | ||
- // } | ||
- // if tqflags & TQ_FRONT != 0 { | ||
- // t.next = self.task.next; | ||
- // t.prev = &self.task; | ||
- // } else { | ||
- // t.next = &self.task; | ||
- // t.prev = self.task.prev; | ||
- // } | ||
- // t.next.prev = t; | ||
- // t.prev.next = t; | ||
- // t.func = func; | ||
- // t.flags = 0; | ||
- // cv_signal(&self.dispatch_cv); | ||
- // mutex_exit(&self.lock); | ||
- // return 1; | ||
- // } | ||
- // | ||
- // taskqid_t | ||
- // taskq_dispatch_delay(taskq_t *tq, task_func_t func, uint_t tqflags, | ||
- // clock_t expire_time) | ||
- // { | ||
- // return 0; | ||
- // } | ||
- | ||
- // pub fn empty_ent(&self) -> bool { | ||
- // self.next == NULL | ||
- // } | ||
- | ||
- // fn taskq_init_ent(taskq_ent_t *t) { | ||
- // t.next = NULL; | ||
- // t.prev = NULL; | ||
- // t.func = NULL; | ||
- // t.flags = 0; | ||
- // } | ||
- | ||
- // fn taskq_dispatch_ent(taskq_t *tq, task_func_t func, uint_t flags, taskq_ent_t *t) { | ||
- // assert!(func != NULL); | ||
- // | ||
- // Mark it as a prealloc'd task. This is important | ||
- // to ensure that we don't free it later. | ||
- // t.flags |= TQENT_FLAG_PREALLOC; | ||
- // Enqueue the task to the underlying queue. | ||
- // mutex_enter(&tq.lock); | ||
- // | ||
- // if (flags & TQ_FRONT) { | ||
- // t.next = tq.task.next; | ||
- // t.prev = &tq.task; | ||
- // } else { | ||
- // t.next = &tq.task; | ||
- // t.prev = tq.task.prev; | ||
- // } | ||
- // t.next.prev = t; | ||
- // t.prev.next = t; | ||
- // t.func = func; | ||
- // cv_signal(&tq.dispatch_cv); | ||
- // mutex_exit(&tq.lock); | ||
- // } | ||
- | ||
- // fn wait(&self) { | ||
- // mutex_enter(&tq.lock); | ||
- // while tq.task.next != &tq.task || tq.active > 0 { | ||
- // cv_wait(&tq.wait_cv, &tq.lock); | ||
- // } | ||
- // mutex_exit(&tq.lock); | ||
- // } | ||
- // | ||
- // fn wait_id(&self, id: TaskId) { | ||
- // self.wait(); | ||
- // } | ||
- // | ||
- // fn wait_outstanding(&self, id: TaskId) { | ||
- // self.wait(); | ||
- // } | ||
- // | ||
- // fn destroy(&mut self) { | ||
- // int num_threads = tq->tq_num_threads; | ||
- // | ||
- // taskq_wait(tq); | ||
- // | ||
- // mutex_enter(&tq->tq_lock); | ||
- // | ||
- // tq->tq_flags &= ~TASKQ_ACTIVE; | ||
- // cv_broadcast(&tq->tq_dispatch_cv); | ||
- // | ||
- // while tq->tq_num_threads > 0 { | ||
- // cv_wait(&tq->tq_wait_cv, &tq->tq_lock); | ||
- // } | ||
- // | ||
- // tq.min_alloc = 0; | ||
- // while (tq.num_alloc != 0) { | ||
- // ASSERT(tq->tq_freelist != NULL); | ||
- // task_free(tq, task_alloc(tq, KM_SLEEP)); | ||
- // } | ||
- // | ||
- // mutex_exit(&tq->tq_lock); | ||
- // | ||
- // kmem_free(tq->tq_thread_list, num_threads * sizeof (kthread_t *)); | ||
- // | ||
- // rw_destroy(&tq->tq_threadlock); | ||
- // mutex_destroy(&tq->tq_lock); | ||
- // cv_destroy(&tq->tq_dispatch_cv); | ||
- // cv_destroy(&tq->tq_wait_cv); | ||
- // cv_destroy(&tq->tq_max_alloc_cv); | ||
- // | ||
- // kmem_free(tq, sizeof (taskq_t)); | ||
- // } | ||
- // | ||
- // pub fn member(&self, thread_id: ThreadId) -> bool { | ||
- // for i in 0..self.num_threads { | ||
- // if self.thread_list[i] == t { | ||
- // return true; | ||
- // } | ||
- // } | ||
- // | ||
- // false | ||
- // } | ||
- | ||
- pub fn cancel_id(&mut self, id: TaskId) -> zfs::Result<()> { | ||
- Err(zfs::Error::NoEntity) | ||
- } | ||
-} | ||
- | ||
-// fn system_taskq_init() { | ||
-// system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512, | ||
-// TASKQ_DYNAMIC | TASKQ_PREPOPULATE); | ||
-// } | ||
-// | ||
-// fn system_taskq_fini() { | ||
-// taskq_destroy(system_taskq); | ||
-// system_taskq = NULL; // defensive | ||
-// } | ||
- | ||
-//-------------------------------------------------------------------------------------------------// | ||
- | ||
-pub struct TaskId(usize); | ||
- | ||
-struct Task { | ||
- // taskq_ent *next; | ||
- // taskq_ent *prev; | ||
- func: Box<FnMut()>, | ||
- flags: u64, | ||
-} | ||
- | ||
-//-------------------------------------------------------------------------------------------------// | ||
- | ||
-// fn taskq_thread(task_r: Receiver<Task>) { | ||
-// while let Ok(task) = task_r.recv() { | ||
-// (task.func)(); | ||
-// } | ||
-// } | ||
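- | ||
-// A hedged sketch, not part of the original port, of the channel-based worker | ||
-// model that the commented-out code above gestures at. `WorkItem` and | ||
-// `spawn_worker` are hypothetical names, and a plain `fn()` pointer stands in | ||
-// for the boxed `TaskFn` so the example stays trivially compilable. | ||
-use std::sync::mpsc::{channel, Sender}; | ||
- | ||
-struct WorkItem { | ||
- func: fn(), | ||
-} | ||
- | ||
-fn spawn_worker() -> Sender<WorkItem> { | ||
- let (tx, rx) = channel::<WorkItem>(); | ||
- thread::spawn(move || { | ||
- // Run queued items until every Sender is dropped, then exit the thread. | ||
- while let Ok(item) = rx.recv() { | ||
- (item.func)(); | ||
- } | ||
- }); | ||
- tx | ||
-} | ||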
- | ||
-// fn taskq_thread(task_r: Receiver<Task>) { | ||
-// taskq_t *tq = arg; | ||
-// taskq_ent_t *t; | ||
-// | ||
-// mutex_enter(&tq.lock); | ||
-// while tq.flags & TASKQ_ACTIVE != 0 { | ||
-// if (t = tq.task.next) == &tq.task { | ||
-// tq.active -= 1; | ||
-// if tq.active == 0 { | ||
-// cv_broadcast(&tq.wait_cv); | ||
-// } | ||
-// cv_wait(&tq.dispatch_cv, &tq.lock); | ||
-// tq.active++; | ||
-// continue; | ||
-// } | ||
-// t.prev.next = t.next; | ||
-// t.next.prev = t.prev; | ||
-// t.next = NULL; | ||
-// t.prev = NULL; | ||
-// mutex_exit(&tq.lock); | ||
-// | ||
-// rw_enter(&tq.threadlock, RW_READER); | ||
-// t.func(t.arg); | ||
-// rw_exit(&tq.threadlock); | ||
-// | ||
-// mutex_enter(&tq.lock); | ||
-// if !t.flags & TQENT_FLAG_PREALLOC != 0 { | ||
-// task_free(tq, t); | ||
-// } | ||
-// } | ||
-// tq.num_threads--; | ||
-// cv_broadcast(&tq.wait_cv); | ||
-// mutex_exit(&tq.lock); | ||
-// thread_exit(); | ||
-// } |
5
crates/zfs/txg.rs
@@ -1,5 +0,0 @@ | ||
-pub const DEFER_SIZE: usize = 2; | ||
- | ||
-pub const TXG_SIZE: usize = 4; | ||
- | ||
-pub const TXG_INITIAL: usize = TXG_SIZE; |
47
crates/zfs/uberblock.rs
@@ -1,47 +0,0 @@ | ||
-use std::{mem, ptr}; | ||
- | ||
-use super::from_bytes::FromBytes; | ||
-use super::block_ptr::BlockPtr; | ||
- | ||
-const UBERBLOCK_MAGIC: u64 = 0x00bab10c; // oo-ba-bloc! | ||
-pub const UBERBLOCK_SHIFT: u64 = 10; // up to 1K | ||
- | ||
-#[derive(Copy, Clone, Debug)] | ||
-#[repr(packed)] | ||
-pub struct Uberblock { | ||
- pub magic: u64, | ||
- pub version: u64, | ||
- pub txg: u64, | ||
- pub guid_sum: u64, | ||
- pub timestamp: u64, | ||
- pub rootbp: BlockPtr, | ||
-} | ||
- | ||
-impl Uberblock { | ||
- pub fn magic_little() -> u64 { | ||
- return 0x0cb1ba00; | ||
- } | ||
- | ||
- pub fn magic_big() -> u64 { | ||
- return 0x00bab10c; | ||
- } | ||
-} | ||
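- | ||
-// A hedged sketch, not part of the original file: viewed as 32-bit words, the two | ||
-// magic constants above are byte swaps of each other, which is how a reader can | ||
-// tell that a label was written with the opposite endianness. The test name is | ||
-// made up here purely for illustration. | ||
-#[test] | ||
-fn magic_values_are_byte_swapped() { | ||
- assert_eq!((Uberblock::magic_big() as u32).swap_bytes() as u64, | ||
- Uberblock::magic_little()); | ||
-} | ||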
- | ||
-impl FromBytes for Uberblock { | ||
- fn from_bytes(data: &[u8]) -> Result<Self, String> { | ||
- if data.len() >= mem::size_of::<Uberblock>() { | ||
- let uberblock = unsafe { ptr::read(data.as_ptr() as *const Uberblock) }; | ||
- if uberblock.magic == Uberblock::magic_little() { | ||
- Ok(uberblock) | ||
- } else if uberblock.magic == Uberblock::magic_big() { | ||
- Ok(uberblock) | ||
- } else { | ||
- Err("Error: Invalid uberblock magic number".to_string()) | ||
- } | ||
- } else { | ||
- Err(format!("Error: Need {} bytes to read uberblock, only {} in buffer", | ||
- mem::size_of::<Uberblock>(), | ||
- data.len())) | ||
- } | ||
- } | ||
-} |
74
crates/zfs/util.rs
@@ -1,74 +0,0 @@ | ||
- | ||
-// Compatibility macros/typedefs needed for Solaris -> Linux port | ||
-pub fn p2_align(x: u64, align: u64) -> u64 { | ||
- x & -(align as i64) as u64 | ||
-} | ||
- | ||
-fn p2_cross(x: u64, y: u64, align: u64) -> bool { | ||
- (x ^ y) > (align - 1) | ||
-} | ||
- | ||
-fn p2_round_up(x: u64, align: u64) -> u64 { | ||
- ((x - 1) | (align - 1)) + 1 | ||
-} | ||
- | ||
-fn p2_boundary(off: u64, len: u64, align: u64) -> bool { | ||
- (off ^ (off + len - 1)) > (align - 1) | ||
-} | ||
- | ||
-fn p2_phase(x: u64, align: u64) -> u64 { | ||
- x & (align - 1) | ||
-} | ||
- | ||
-fn p2_nphase(x: u64, align: u64) -> u64 { | ||
- -(x as i64) as u64 & (align - 1) | ||
-} | ||
- | ||
-fn p2_nphase_typed(x: u64, align: u64) -> u64 { | ||
- -(x as i64) as u64 & (align - 1) | ||
-} | ||
- | ||
-fn is_p2(x: u64) -> bool { | ||
- x & (x - 1) == 0 | ||
-} | ||
- | ||
-fn is_p2_aligned(v: u64, a: u64) -> bool { | ||
- v & (a - 1) == 0 | ||
-} | ||
- | ||
-pub fn highbit64(u: u64) -> u32 { | ||
- 63 - u.leading_zeros() | ||
-} | ||
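- | ||
-// A hedged worked example, not part of the original file: concrete values for the | ||
-// power-of-two helpers above, written as a test whose name is made up here purely | ||
-// for illustration. | ||
-#[test] | ||
-fn p2_helpers_worked_example() { | ||
- assert_eq!(p2_align(0x1234, 0x1000), 0x1000); // align down to a 4 KiB boundary | ||
- assert_eq!(p2_round_up(0x1234, 0x1000), 0x2000); // round up to the next boundary | ||
- assert_eq!(p2_phase(0x1234, 0x1000), 0x234); // offset within the boundary | ||
- assert!(is_p2(0x1000)); // exactly one bit set | ||
- assert_eq!(highbit64(0x1000), 12); // index of that bit | ||
-} | ||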
- | ||
-// Typed version of the P2* macros. These macros should be used to ensure | ||
-// that the result is correctly calculated based on the data type of (x), | ||
-// which is passed in as the last argument, regardless of the data | ||
-// type of the alignment. For example, if (x) is of type uint64_t, | ||
-// and we want to round it up to a page boundary using "PAGESIZE" as | ||
-// the alignment, we can do either | ||
-// P2ROUNDUP(x, (uint64_t)PAGESIZE) | ||
-// or | ||
-// P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) | ||
-// | ||
-// #define P2ALIGN_TYPED(x, align, type) \ | ||
-// ((type)(x) & -(type)(align)) | ||
-// #define P2PHASE_TYPED(x, align, type) \ | ||
-// ((type)(x) & ((type)(align) - 1)) | ||
-// #define P2NPHASE_TYPED(x, align, type) \ | ||
-// (-(type)(x) & ((type)(align) - 1)) | ||
-// #define P2ROUNDUP_TYPED(x, align, type) \ | ||
-// ((((type)(x) - 1) | ((type)(align) - 1)) + 1) | ||
-// #define P2END_TYPED(x, align, type) \ | ||
-// (-(~(type)(x) & -(type)(align))) | ||
-// #define P2PHASEUP_TYPED(x, align, phase, type) \ | ||
-// ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) | ||
-// #define P2CROSS_TYPED(x, y, align, type) \ | ||
-// (((type)(x) ^ (type)(y)) > (type)(align) - 1) | ||
-// #define P2SAMEHIGHBIT_TYPED(x, y, type) \ | ||
-// (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) | ||
-// | ||
-// | ||
-// avoid any possibility of clashing with <stddef.h> version | ||
-// #if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) | ||
-// #define offsetof(s, m) ((size_t)(&(((s *)0)->m))) | ||
-// #endif |
506
crates/zfs/vdev.rs
@@ -1,506 +0,0 @@ | ||
-use std::{cmp, mem}; | ||
-use std::rc::Rc; | ||
- | ||
-use super::dmu_objset::ObjectSet; | ||
-use super::from_bytes::FromBytes; | ||
-use super::metaslab::{Metaslab, MetaslabClass, MetaslabGroup}; | ||
-use super::nvpair::{NvList, NvValue}; | ||
-use super::uberblock; | ||
-use super::util; | ||
-use super::vdev_file::VdevFile; | ||
-use super::zfs; | ||
- | ||
-#[repr(packed)] | ||
-pub struct VdevLabel { | ||
- pub blank: [u8; 8 * 1024], | ||
- pub boot_header: [u8; 8 * 1024], | ||
- pub nv_pairs: [u8; 112 * 1024], | ||
- pub uberblocks: [u8; 128 * 1024], | ||
-} | ||
- | ||
-impl FromBytes for VdevLabel {} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-pub trait IVdevOps { | ||
- /// Returns (size, max_size, ashift) | ||
- fn open(&mut self, vdev: &mut Vdev) -> zfs::Result<(u64, u64, u64)>; | ||
- | ||
- fn close(&mut self, vdev: &mut Vdev); | ||
- | ||
- /// Default asize function: return the MAX of psize with the asize of all children. This is | ||
- /// what's used by anything other than RAID-Z. | ||
- fn asize(&mut self, vdev: &mut Vdev, psize: u64) -> u64; | ||
- | ||
- fn hold(&mut self, vdev: &mut Vdev); | ||
- | ||
- fn release(&mut self, vdev: &mut Vdev); | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-pub struct VdevOps { | ||
- pub ops: Box<IVdevOps>, | ||
- // io_start: fn(&zio::Zio), | ||
- // io_done: fn(&zio::Zio), | ||
- // state_change: fn(), | ||
- vdev_type: String, | ||
- is_leaf: bool, | ||
-} | ||
- | ||
-impl VdevOps { | ||
- pub fn vdev_type(&self) -> &str { | ||
- self.vdev_type.as_ref() | ||
- } | ||
- pub fn is_leaf(&self) -> bool { | ||
- self.is_leaf | ||
- } | ||
-} | ||
- | ||
-fn load_ops(vdev_type: &str, nv: &NvList) -> zfs::Result<VdevOps> { | ||
- match vdev_type { | ||
- "disk" => { | ||
- Ok(VdevOps { | ||
- ops: Box::new(try!(VdevFile::load(nv))), | ||
- vdev_type: "disk".to_string(), | ||
- is_leaf: true, | ||
- }) | ||
- } | ||
- _ => Err(zfs::Error::Invalid), | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
-#[derive(Copy, Clone, Debug, PartialEq)] | ||
-pub enum AllocType { | ||
- Load = 0, | ||
- Add, | ||
- Spare, | ||
- L2Cache, | ||
- RootPool, | ||
- Split, | ||
- Attach, | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-/// States are ordered from least to most healthy. | ||
-/// Vdevs `CannotOpen` and worse are considered unusable. | ||
-#[derive(Copy, Clone, Debug, PartialEq)] | ||
-pub enum State { | ||
- Unknown, // Uninitialized vdev | ||
- Closed, // Not currently open | ||
- Offline, // Not allowed to open | ||
- Removed, // Explicitly removed from the system | ||
- CannotOpen, // Tried to open, but failed | ||
- Faulted, // External request to fault device | ||
- Degraded, // Replicated vdev with unhealthy kids | ||
- Healthy, // Presumed good | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-// Stuff that only top level vdevs have | ||
-pub struct Top { | ||
- pub ms_array: u64, // object ID of metaslab array in MOS | ||
- pub ms_shift: u64, // metaslab shift | ||
- pub ms_group: MetaslabGroup, // metaslab group | ||
- pub metaslabs: Vec<Metaslab>, // in-memory metaslab array | ||
- pub is_hole: bool, | ||
- pub removing: bool, // device is being removed? | ||
-} | ||
- | ||
-impl Top { | ||
- pub fn new(ms_array: u64, ms_shift: u64, ms_group: MetaslabGroup) -> Self { | ||
- Top { | ||
- ms_array: ms_array, | ||
- ms_shift: ms_shift, | ||
- ms_group: ms_group, | ||
- metaslabs: vec![], | ||
- is_hole: false, // TODO: ZoL checks vdev_ops for this; unclear what to do here yet | ||
- removing: false, | ||
- } | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-pub struct Leaf { | ||
- whole_disk: u64, | ||
-} | ||
- | ||
-impl Leaf { | ||
- pub fn new() -> Self { | ||
- Leaf { whole_disk: 0 } | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-// Note that a vdev can be a top-level, a leaf, both, or neither | ||
-pub struct Vdev { | ||
- id: u64, // child number in vdev parent | ||
- guid: u64, // unique ID for this vdev | ||
- guid_sum: u64, // self guid + all child guids | ||
- orig_guid: u64, // orig. guid prior to remove | ||
- asize: u64, // allocatable device capacity | ||
- min_asize: u64, // min acceptable asize | ||
- max_asize: u64, // max acceptable asize | ||
- pub ashift: u64, // block alignment shift | ||
- state: State, | ||
- prev_state: State, | ||
- pub ops: VdevOps, | ||
- parent: Option<TreeIndex>, | ||
- top_vdev: Option<TreeIndex>, | ||
- children: Vec<TreeIndex>, | ||
- create_txg: u64, // txg when top-level was added | ||
- | ||
- pub top: Option<Top>, | ||
- pub leaf: Option<Leaf>, | ||
-} | ||
- | ||
-impl Vdev { | ||
- pub fn new(id: u64, | ||
- guid: Option<u64>, | ||
- ashift: u64, | ||
- ops: VdevOps, | ||
- create_txg: u64, | ||
- vdev_top: Option<Top>) | ||
- -> Self { | ||
- let guid = guid.unwrap_or_else(|| { | ||
- // TODO: generate a guid | ||
- 0 | ||
- }); | ||
- | ||
- // TODO vdev_queue_init | ||
- | ||
- Vdev { | ||
- id: id, | ||
- guid: guid, | ||
- guid_sum: guid, // No children yet, so guid_sum is just my guid | ||
- orig_guid: 0, | ||
- asize: 0, | ||
- min_asize: 0, | ||
- max_asize: 0, | ||
- ashift: ashift, | ||
- state: State::Closed, | ||
- prev_state: State::Unknown, | ||
- ops: ops, | ||
- parent: None, | ||
- top_vdev: None, | ||
- children: Vec::new(), | ||
- create_txg: create_txg, | ||
- | ||
- top: vdev_top, | ||
- leaf: None, | ||
- } | ||
- } | ||
- | ||
- pub fn load(normal_class: &Rc<MetaslabClass>, | ||
- nv: &NvList, | ||
- id: u64, | ||
- parent: Option<TreeIndex>, | ||
- vdev_tree: &Tree, | ||
- alloc_type: AllocType) | ||
- -> zfs::Result<Self> { | ||
- let vdev_type = try!(nv.get::<&String>("type").ok_or(zfs::Error::Invalid)).clone(); | ||
- | ||
- let ops = try!(load_ops(vdev_type.as_ref(), nv)); | ||
- | ||
- if alloc_type == AllocType::Load { | ||
- // Verify the provided id matches the id written in the MOS | ||
- let label_id: u64 = try!(nv.get("id").ok_or(zfs::Error::Invalid)); | ||
- if label_id != id { | ||
- return Err(zfs::Error::Invalid); | ||
- } | ||
- } | ||
- | ||
- // If this is some sort of load, then we read the guid from the nvpairs. Otherwise, | ||
- // Vdev::new will generate one for us | ||
- let guid = match alloc_type { | ||
- AllocType::Load | AllocType::Spare | AllocType::L2Cache | AllocType::RootPool => { | ||
- Some(try!(nv.get("guid").ok_or(zfs::Error::Invalid))) | ||
- } | ||
- _ => None, | ||
- }; | ||
- | ||
- let create_txg = try!(nv.get("create_txg").ok_or(zfs::Error::Invalid)); | ||
- let ashift = try!(nv.get("ashift").ok_or(zfs::Error::Invalid)); | ||
- | ||
- let mut vdev_top = None; | ||
- | ||
- // If we're a top-level vdev, try to load the allocation parameters, | ||
- // create the metaslab group, and create the vdev::Top | ||
- if let Some(parent) = parent { | ||
- if parent.get(vdev_tree).parent.is_none() { | ||
- let mut ms_array = 0; | ||
- let mut ms_shift = 0; | ||
- if alloc_type == AllocType::Load || alloc_type == AllocType::Split { | ||
- ms_array = try!(nv.get("metaslab_array").ok_or(zfs::Error::Invalid)); | ||
- ms_shift = try!(nv.get("metaslab_shift").ok_or(zfs::Error::Invalid)); | ||
- // let asize = try!(nv.get("asize").ok_or(zfs::Error::Invalid)); | ||
- // let removing = try!(nv.get("removing").ok_or(zfs::Error::Invalid)); | ||
- } | ||
- | ||
- if alloc_type != AllocType::Attach { | ||
- assert!(alloc_type == AllocType::Load || alloc_type == AllocType::Add || | ||
- alloc_type == AllocType::Split || | ||
- alloc_type == AllocType::RootPool); | ||
- let ms_group = MetaslabGroup::create(normal_class.clone()); | ||
- | ||
- vdev_top = Some(Top::new(ms_array, ms_shift, ms_group)); | ||
- } | ||
- } | ||
- } | ||
- | ||
- let mut vdev = Self::new(id, guid, ashift, ops, create_txg, vdev_top); | ||
- vdev.parent = parent; | ||
- | ||
- Ok(vdev) | ||
- } | ||
- | ||
- fn open(&mut self) -> zfs::Result<()> { | ||
- Ok(()) | ||
- } | ||
- | ||
- fn metaslab_init(&mut self, mos: &mut ObjectSet, txg: u64) -> zfs::Result<()> { | ||
- // We assume this is a top-level vdev | ||
- let ref mut top = try!(self.top.as_mut().ok_or(zfs::Error::Invalid)); | ||
- | ||
- let old_count = top.metaslabs.len(); | ||
- let new_count = (self.asize >> top.ms_shift) as usize; | ||
- | ||
- // assert!(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); | ||
- | ||
- // Return if vdev isn't being allocated from yet | ||
- if top.ms_shift == 0 { | ||
- return Ok(()); | ||
- } | ||
- assert!(!top.is_hole); // Must not be a hole | ||
- | ||
- // Compute the raidz-deflation ratio. Note, we hard-code | ||
- // in 128k (1 << 17) because it is the "typical" blocksize. | ||
- // Even though SPA_MAXBLOCKSIZE changed, this algorithm cannot change, | ||
- // otherwise it would inconsistently account for existing bp's. | ||
- // vd->vdev_deflate_ratio = (1 << 17) / (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); | ||
- | ||
- assert!(old_count <= new_count); | ||
- | ||
- for m in old_count..new_count { | ||
- let object: u64 = 0; | ||
- | ||
- if txg == 0 { | ||
- // try!(dmu_read(mos, top.ms_array, m * mem::size_of::<u64>(), | ||
- // mem::size_of::<u64>(), &object, DMU_READ_PREFETCH)); | ||
- } | ||
- | ||
- // let metaslab = try!(Metaslab::init(mos, self, m as u64, object, txg)); | ||
- // top.metaslabs.push(metaslab); | ||
- } | ||
- | ||
- // if (txg == 0) | ||
- // spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); | ||
- | ||
- // If the vdev is being removed we don't activate | ||
- // the metaslabs since we want to ensure that no new | ||
- // allocations are performed on this device. | ||
- if old_count == 0 && !top.removing { | ||
- // metaslab_group_activate(vd.mg); | ||
- } | ||
- | ||
- // if (txg == 0) | ||
- // spa_config_exit(spa, SCL_ALLOC, FTAG); | ||
- | ||
- Ok(()) | ||
- } | ||
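- | ||
- // A hedged worked example, not part of the original file: with ms_shift = 29 | ||
- // (512 MiB metaslabs), a top-level vdev whose asize is 10 GiB gets | ||
- // new_count = asize >> ms_shift = 20 metaslabs, and a later expansion of the | ||
- // device only ever appends more (old_count <= new_count is asserted above). | ||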
- | ||
- // Get the minimum allocatable size. We define the allocatable size as | ||
- // the vdev's asize rounded to the nearest metaslab. This allows us to | ||
- // replace or attach devices which don't have the same physical size but | ||
- // can still satisfy the same number of allocations. | ||
- // fn get_min_asize(&self, parent: Option<&Vdev>) -> u64 { | ||
- // vdev_t *pvd = vd->vdev_parent; | ||
- // | ||
- // If our parent is NULL (inactive spare or cache) or is the root, | ||
- // just return our own asize. | ||
- // if self.parent.is_none() { | ||
- // return self.asize; | ||
- // } | ||
- // | ||
- // The top-level vdev just returns the allocatable size rounded | ||
- // to the nearest metaslab. | ||
- // if let Some(ref top) = self.top { | ||
- // return util::p2_align(self.asize, 1u64 << top.ms_shift); | ||
- // } | ||
- // | ||
- // The allocatable space for a raidz vdev is N * sizeof(smallest child), | ||
- // so each child must provide at least 1/Nth of its asize. | ||
- // if pvd->vdev_ops == &vdev_raidz_ops { | ||
- // return pvd->vdev_min_asize / pvd->vdev_children; | ||
- // } | ||
- // | ||
- // pvd->vdev_min_asize | ||
- // } | ||
- | ||
- | ||
- // pub fn dirty(&mut self, flags: u64, void *arg, txg: u64) { | ||
- // We assume this is a top-level vdev | ||
- // let ref top = self.top.unwrap(); | ||
- // | ||
- // assert!(self == self.top_vdev); | ||
- // assert!(!self.is_hole); | ||
- // assert!(util::is_p2(flags)); | ||
- // assert!(spa_writeable(self.spa)); | ||
- // | ||
- // if flags & DIRTY_METASLAB { | ||
- // txg_list_add(&self.ms_list, arg, txg); | ||
- // } | ||
- // | ||
- // if flags & DIRTY_DTL { | ||
- // txg_list_add(&self.dtl_list, arg, txg); | ||
- // } | ||
- // | ||
- // txg_list_add(&self.spa.vdev_txg_list, self, txg); | ||
- // } | ||
- | ||
- pub fn uberblock_shift(&self) -> u64 { | ||
- cmp::min(cmp::max(self.ashift, uberblock::UBERBLOCK_SHIFT), | ||
- MAX_UBERBLOCK_SHIFT) | ||
- } | ||
- | ||
- pub fn uberblock_count(&self) -> u64 { | ||
- UBERBLOCK_RING >> self.uberblock_shift() | ||
- } | ||
- | ||
- // pub fn uberblock_offset(&self, n) -> u64 { | ||
- // offsetof(vdev_label_t, vl_uberblock[n << self.uberblock_shift()]) | ||
- // } | ||
- | ||
- pub fn uberblock_size(&self) -> u64 { | ||
- 1 << self.uberblock_shift() | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
-#[derive(Copy, Clone, PartialEq)] | ||
-pub struct TreeIndex(usize); | ||
- | ||
-impl TreeIndex { | ||
- pub fn get<'a>(&self, tree: &'a Tree) -> &'a Vdev { | ||
- tree.nodes[self.0].as_ref().unwrap() | ||
- } | ||
- | ||
- pub fn get_mut<'a>(&self, tree: &'a mut Tree) -> &'a mut Vdev { | ||
- tree.nodes[self.0].as_mut().unwrap() | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-pub struct Tree { | ||
- nodes: Vec<Option<Vdev>>, | ||
- free: Vec<usize>, | ||
-} | ||
- | ||
-impl Tree { | ||
- pub fn new() -> Self { | ||
- Tree { | ||
- nodes: Vec::new(), | ||
- free: Vec::new(), | ||
- } | ||
- } | ||
- | ||
- pub fn add(&mut self, vdev: Vdev) -> TreeIndex { | ||
- let parent = vdev.parent; | ||
- let guid = vdev.guid; | ||
- | ||
- // Add the vdev node | ||
- let index = TreeIndex(match self.free.pop() { | ||
- Some(free_index) => { | ||
- self.nodes[free_index] = Some(vdev); | ||
- free_index | ||
- } | ||
- None => { | ||
- self.nodes.push(Some(vdev)); | ||
- self.nodes.len() - 1 | ||
- } | ||
- }); | ||
- | ||
- index.get_mut(self).top_vdev = parent.map(|parent| { | ||
- parent.get(self).top_vdev.unwrap_or(index) | ||
- }); | ||
- | ||
- if let Some(parent) = parent { | ||
- parent.get_mut(self).guid_sum += guid; | ||
- parent.get_mut(self).children.push(index); | ||
- } | ||
- | ||
- index | ||
- } | ||
- | ||
- pub fn parse(&mut self, | ||
- normal_class: &Rc<MetaslabClass>, | ||
- nv: &NvList, | ||
- parent: Option<TreeIndex>, | ||
- alloc_type: AllocType) | ||
- -> zfs::Result<TreeIndex> { | ||
- let vdev = try!(Vdev::load(normal_class, nv, 0, parent, self, alloc_type)); | ||
- let index = self.add(vdev); | ||
- | ||
- // Done parsing if this is a leaf | ||
- if index.get(self).ops.is_leaf() { | ||
- return Ok(index); | ||
- } | ||
- | ||
- // Get the vdev's children | ||
- let children: &Vec<NvList> = try!(nv.get("children").ok_or(zfs::Error::Invalid)); | ||
- | ||
- for child in children { | ||
- try!(self.parse(normal_class, child, Some(index), alloc_type)); | ||
- } | ||
- | ||
- Ok(index) | ||
- } | ||
- | ||
- pub fn load(&mut self, mos: &mut ObjectSet, root: TreeIndex) { | ||
- // We use an iterative solution because of borrowing issues | ||
- let mut queue = vec![root]; | ||
- | ||
- while let Some(index) = queue.pop() { | ||
- let vdev = index.get_mut(self); | ||
- | ||
- // Recursively load all children | ||
- for child in &vdev.children { | ||
- queue.push(*child); | ||
- } | ||
- | ||
- // Load metaslabs for top-level vdevs | ||
- // if let Some(ref top) = vdev.top { | ||
- if vdev.top.is_some() { | ||
- // if !top.is_hole { | ||
- if vdev.ashift == 0 || vdev.asize == 0 || vdev.metaslab_init(mos, 0).is_err() { | ||
- // TODO: Set vdev state to error | ||
- } | ||
- // } | ||
- } | ||
- | ||
- // TODO: Load DTL for leaf vdevs | ||
- } | ||
- } | ||
-} | ||
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-const DIRTY_METASLAB: u64 = 0x01; | ||
-const DIRTY_DTL: u64 = 0x02; | ||
- | ||
-const RAIDZ_MAXPARITY: usize = 3; | ||
- | ||
-const PAD_SIZE: u64 = 8 << 10; | ||
-// 2 padding areas (vl_pad1 and vl_pad2) to skip | ||
-const SKIP_SIZE: u64 = PAD_SIZE * 2; | ||
-const PHYS_SIZE: u64 = 112 << 10; | ||
-const UBERBLOCK_RING: u64 = 128 << 10; | ||
- | ||
-// The largest uberblock we support is 8k. | ||
-const MAX_UBERBLOCK_SHIFT: u64 = 13; |
34
crates/zfs/vdev_file.rs
@@ -1,34 +0,0 @@ | ||
-use super::nvpair::NvList; | ||
-use super::{vdev, zfs}; | ||
- | ||
-pub struct VdevFile { | ||
- path: String, | ||
-} | ||
- | ||
-impl VdevFile { | ||
- pub fn load(nv: &NvList) -> zfs::Result<Self> { | ||
- Ok(VdevFile { path: try!(nv.get::<&String>("path").ok_or(zfs::Error::Invalid)).clone() }) | ||
- } | ||
- | ||
- // pub fn io_start(zio: &zio::Zio); | ||
- | ||
- // pub fn io_done(zio: &zio::Zio); | ||
- | ||
- // pub fn state_change(); | ||
-} | ||
- | ||
-impl vdev::IVdevOps for VdevFile { | ||
- fn open(&mut self, vdev: &mut vdev::Vdev) -> zfs::Result<(u64, u64, u64)> { | ||
- Ok((0, 0, 0)) | ||
- } | ||
- | ||
- fn close(&mut self, vdev: &mut vdev::Vdev) {} | ||
- | ||
- fn asize(&mut self, vdev: &mut vdev::Vdev, psize: u64) -> u64 { | ||
- 0 | ||
- } | ||
- | ||
- fn hold(&mut self, vdev: &mut vdev::Vdev) {} | ||
- | ||
- fn release(&mut self, vdev: &mut vdev::Vdev) {} | ||
-} |
1,011
crates/zfs/vdev_label.rs
@@ -1,1011 +0,0 @@ | ||
-use std::mem; | ||
- | ||
-use super::vdev::VdevLabel; | ||
- | ||
-// vdev_dirty() flags | ||
-const VDD_METASLAB: u64 = 0x01; | ||
-const VDD_DTL: u64 = 0x02; | ||
- | ||
-// Offset of embedded boot loader region on each label | ||
-const VDEV_BOOT_OFFSET: usize = 2 * mem::size_of::<VdevLabel>(); | ||
-// Size of embedded boot loader region on each label. | ||
-// The total size of the first two labels plus the boot area is 4MB. | ||
-const VDEV_BOOT_SIZE: usize = 7 << 19; // 3.5M | ||
- | ||
-// Size of label regions at the start and end of each leaf device. | ||
-const VDEV_LABEL_START_SIZE: usize = (2 * mem::size_of::<VdevLabel>() + VDEV_BOOT_SIZE); | ||
-const VDEV_LABEL_END_SIZE: usize = (2 * mem::size_of::<VdevLabel>()); | ||
-const VDEV_LABELS: u8 = 4; | ||
-const VDEV_BEST_LABEL: u8 = VDEV_LABELS; | ||
- | ||
-// Basic routines to read and write from a vdev label. | ||
-// Used throughout the rest of this file. | ||
-fn vdev_label_offset(psize: u64, l: u8, offset: u64) -> u64 { | ||
- assert!(offset < mem::size_of::<VdevLabel>() as u64); | ||
- //assert!(P2PHASE_TYPED(psize, mem::size_of::<VdevLabel>(), u64) == 0); | ||
- | ||
- offset + (l as u64) * (mem::size_of::<VdevLabel>() as u64) + | ||
- if l < VDEV_LABELS / 2 { | ||
- 0 | ||
- } else { | ||
- psize - (VDEV_LABELS as u64) * (mem::size_of::<VdevLabel>() as u64) | ||
- } | ||
-} | ||
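- | ||
-// A hedged worked example, not part of the original file: mem::size_of::<VdevLabel>() | ||
-// is 8K + 8K + 112K + 128K = 256K, and the four labels sit two at the front and two | ||
-// at the back of the device, so for a device of physical size P: | ||
-// label 0 -> 0 label 1 -> 256K | ||
-// label 2 -> P - 512K label 3 -> P - 256K | ||
-// e.g. vdev_label_offset(P, 2, 0) = 0 + 2*256K + (P - 4*256K) = P - 512K. | ||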
- | ||
-// Returns the vdev label associated with the passed-in offset. | ||
-fn vdev_label_number(psize: u64, mut offset: u64) -> Option<u64> { | ||
- if offset >= psize - VDEV_LABEL_END_SIZE as u64 { | ||
- offset -= psize - VDEV_LABEL_END_SIZE as u64; | ||
- offset += ((VDEV_LABELS as u64) / 2) * (mem::size_of::<VdevLabel>() as u64); | ||
- } | ||
- let l = offset / (mem::size_of::<VdevLabel>() as u64); | ||
- if l < (VDEV_LABELS as u64) { | ||
- Some(l) | ||
- } else { | ||
- None | ||
- } | ||
-} | ||
- | ||
-fn vdev_label_read(zio_t *zio, vdev_t *vd, l: u8, void *buf, offset: u64, | ||
- size: u64, zio_done_func_t *done, void *private, flags: u64) { | ||
- //assert!(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); | ||
- //assert!(flags & ZIO_FLAG_CONFIG_WRITER); | ||
- | ||
- Zio::read_phys(zio, vd, vdev_label_offset(vd.psize, l, offset), | ||
- size, buf, ZIO_CHECKSUM_LABEL, done, private, | ||
- zio::Priority::SyncRead, flags, true).no_wait(); | ||
-} | ||
- | ||
-static void | ||
-vdev_label_write(zio_t *zio, vdev_t *vd, l: u8, void *buf, uint64_t offset, | ||
- uint64_t size, zio_done_func_t *done, void *private, int flags) | ||
-{ | ||
- assert!(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL || | ||
- (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) == | ||
- (SCL_CONFIG | SCL_STATE) && | ||
- dsl_pool_sync_context(spa_get_dsl(zio->io_spa)))); | ||
- assert!(flags & ZIO_FLAG_CONFIG_WRITER); | ||
- | ||
- zio.write_phys(vd, vdev_label_offset(vd->vdev_psize, l, offset), | ||
- size, buf, ZIO_CHECKSUM_LABEL, done, private, | ||
- ZIO_PRIORITY_SYNC_WRITE, flags, true).no_wait(); | ||
-} | ||
- | ||
-// Generate the nvlist representing this vdev's config. | ||
-fn vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vdev_config_flag_t flags) -> NvList { | ||
- let nv = NvList::new(0); | ||
- | ||
- nv.add("type".to_string(), NvValue::String(vd.ops.vdev_type)); | ||
- if !(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) { | ||
- nv.add("id".to_string(), NvValue::Uint64(vd.id)); | ||
- } | ||
- nv.add("guid".to_string(), NvValue::Uint64(vd.guid)); | ||
- | ||
- if (vd->vdev_path != NULL) | ||
- fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); | ||
- | ||
- if (vd->vdev_devid != NULL) | ||
- fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); | ||
- | ||
- if (vd->vdev_physpath != NULL) | ||
- fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, | ||
- vd->vdev_physpath); | ||
- | ||
- if (vd->vdev_fru != NULL) | ||
- fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); | ||
- | ||
- if (vd->vdev_nparity != 0) { | ||
- assert!(strcmp(vd->vdev_ops->vdev_op_type, | ||
- VDEV_TYPE_RAIDZ) == 0); | ||
- | ||
- // Make sure someone hasn't managed to sneak a fancy new vdev | ||
- // into a crufty old storage pool. | ||
- assert!(vd->vdev_nparity == 1 || | ||
- (vd->vdev_nparity <= 2 && | ||
- spa_version(spa) >= SPA_VERSION_RAIDZ2) || | ||
- (vd->vdev_nparity <= 3 && | ||
- spa_version(spa) >= SPA_VERSION_RAIDZ3)); | ||
- | ||
- // Note that we'll add the nparity tag even on storage pools | ||
- // that only support a single parity device -- older software | ||
- // will just ignore it. | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); | ||
- } | ||
- | ||
- if (vd->vdev_wholedisk != -1ULL) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, | ||
- vd->vdev_wholedisk); | ||
- | ||
- if (vd->vdev_not_present) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); | ||
- | ||
- if (vd->vdev_isspare) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); | ||
- | ||
- if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && | ||
- vd == vd->vdev_top) { | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, | ||
- vd->vdev_ms_array); | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, | ||
- vd->vdev_ms_shift); | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, | ||
- vd->vdev_asize); | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); | ||
- if (vd->vdev_removing) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, | ||
- vd->vdev_removing); | ||
- } | ||
- | ||
- if (vd->vdev_dtl_sm != NULL) { | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, | ||
- space_map_object(vd->vdev_dtl_sm)); | ||
- } | ||
- | ||
- if (vd->vdev_crtxg) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg); | ||
- | ||
- if (getstats) { | ||
- vdev_stat_t vs; | ||
- pool_scan_stat_t ps; | ||
- | ||
- vdev_get_stats(vd, &vs); | ||
- fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, | ||
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)); | ||
- | ||
- // provide either current or previous scan information | ||
- if (spa_scan_get_stats(spa, &ps) == 0) { | ||
- fnvlist_add_uint64_array(nv, | ||
- ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, | ||
- sizeof (pool_scan_stat_t) / sizeof (uint64_t)); | ||
- } | ||
- } | ||
- | ||
- if (!vd->vdev_ops->vdev_op_leaf) { | ||
- nvlist_t **child; | ||
- int c, idx; | ||
- | ||
- assert!(!vd->vdev_ishole); | ||
- | ||
- child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), | ||
- KM_SLEEP); | ||
- | ||
- for (c = 0, idx = 0; c < vd->vdev_children; c++) { | ||
- vdev_t *cvd = vd->vdev_child[c]; | ||
- | ||
- // If we're generating an nvlist of removing | ||
- // vdevs then skip over any device which is | ||
- // not being removed. | ||
- if ((flags & VDEV_CONFIG_REMOVING) && | ||
- !cvd->vdev_removing) | ||
- continue; | ||
- | ||
- child[idx++] = vdev_config_generate(spa, cvd, | ||
- getstats, flags); | ||
- } | ||
- | ||
- if (idx) { | ||
- fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, | ||
- child, idx); | ||
- } | ||
- | ||
- for (c = 0; c < idx; c++) | ||
- nvlist_free(child[c]); | ||
- | ||
- kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); | ||
- | ||
- } else { | ||
- const char *aux = NULL; | ||
- | ||
- if (vd->vdev_offline && !vd->vdev_tmpoffline) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, true); | ||
- if (vd->vdev_resilver_txg != 0) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, | ||
- vd->vdev_resilver_txg); | ||
- if (vd->vdev_faulted) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, true); | ||
- if (vd->vdev_degraded) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, true); | ||
- if (vd->vdev_removed) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, true); | ||
- if (vd->vdev_unspare) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, true); | ||
- if (vd->vdev_ishole) | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, true); | ||
- | ||
- switch (vd->vdev_stat.vs_aux) { | ||
- case VDEV_AUX_ERR_EXCEEDED: | ||
- aux = "err_exceeded"; | ||
- break; | ||
- | ||
- case VDEV_AUX_EXTERNAL: | ||
- aux = "external"; | ||
- break; | ||
- } | ||
- | ||
- if (aux != NULL) | ||
- fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); | ||
- | ||
- if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { | ||
- fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, | ||
- vd->vdev_orig_guid); | ||
- } | ||
- } | ||
- | ||
- return (nv); | ||
-} | ||
- | ||
-// Generate a view of the top-level vdevs. If we currently have holes | ||
-// in the namespace, then generate an array which contains a list of holey | ||
-// vdevs. Additionally, add the number of top-level children that currently | ||
-// exist. | ||
-void | ||
-vdev_top_config_generate(spa_t *spa, nvlist_t *config) | ||
-{ | ||
- vdev_t *rvd = spa->spa_root_vdev; | ||
- uint64_t *array; | ||
- uint_t c, idx; | ||
- | ||
- array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); | ||
- | ||
- for (c = 0, idx = 0; c < rvd->vdev_children; c++) { | ||
- vdev_t *tvd = rvd->vdev_child[c]; | ||
- | ||
- if (tvd->vdev_ishole) | ||
- array[idx++] = c; | ||
- } | ||
- | ||
- if (idx) { | ||
- VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, | ||
- array, idx) == 0); | ||
- } | ||
- | ||
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, | ||
- rvd->vdev_children) == 0); | ||
- | ||
- kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); | ||
-} | ||
- | ||
-// Returns the configuration from the label of the given vdev. For vdevs | ||
-// which don't have a txg value stored on their label (i.e. spares/cache) | ||
-// or have not been completely initialized (txg = 0) just return | ||
-// the configuration from the first valid label we find. Otherwise, | ||
-// find the most up-to-date label that does not exceed the specified | ||
-// 'txg' value. | ||
-fn vdev_label_read_config(vdev_t *vd, uint64_t txg) -> NvList { | ||
- spa_t *spa = vd->vdev_spa; | ||
- nvlist_t *config = NULL; | ||
- vdev_phys_t *vp; | ||
- uint64_t best_txg = 0; | ||
- int error = 0; | ||
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | | ||
- ZIO_FLAG_SPECULATIVE; | ||
- int l; | ||
- | ||
- assert!(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); | ||
- | ||
- if (!vdev_readable(vd)) | ||
- return (NULL); | ||
- | ||
- vp = zio_buf_alloc(sizeof (vdev_phys_t)); | ||
- | ||
-retry: | ||
- for (l = 0; l < VDEV_LABELS; l++) { | ||
- nvlist_t *label = NULL; | ||
- | ||
- let zio = Zio::root(spa, None, None, flags); | ||
- | ||
- vdev_label_read(zio, vd, l, vp, | ||
- offsetof(vdev_label_t, vl_vdev_phys), | ||
- sizeof (vdev_phys_t), NULL, NULL, flags); | ||
- | ||
- if (zio_wait(zio) == 0 && | ||
- nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), | ||
- &label, 0) == 0) { | ||
- uint64_t label_txg = 0; | ||
- | ||
- // Auxiliary vdevs won't have txg values in their | ||
- // labels and newly added vdevs may not have been | ||
- // completely initialized so just return the | ||
- // configuration from the first valid label we | ||
- // encounter. | ||
- error = nvlist_lookup_uint64(label, | ||
- ZPOOL_CONFIG_POOL_TXG, &label_txg); | ||
- if ((error || label_txg == 0) && !config) { | ||
- config = label; | ||
- break; | ||
- } else if (label_txg <= txg && label_txg > best_txg) { | ||
- best_txg = label_txg; | ||
- nvlist_free(config); | ||
- config = fnvlist_dup(label); | ||
- } | ||
- } | ||
- | ||
- if (label != NULL) { | ||
- nvlist_free(label); | ||
- label = NULL; | ||
- } | ||
- } | ||
- | ||
- if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { | ||
- flags |= ZIO_FLAG_TRYHARD; | ||
- goto retry; | ||
- } | ||
- | ||
- zio_buf_free(vp, sizeof (vdev_phys_t)); | ||
- | ||
- return (config); | ||
-} | ||
- | ||
-// Determine if a device is in use. The 'spare_guid' parameter will be filled | ||
-// in with the device guid if this spare is active elsewhere on the system. | ||
-vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, | ||
- uint64_t *spare_guid, uint64_t *l2cache_guid) -> bool { | ||
- spa_t *spa = vd->vdev_spa; | ||
- uint64_t state, pool_guid, device_guid, txg, spare_pool; | ||
- uint64_t vdtxg = 0; | ||
- nvlist_t *label; | ||
- | ||
- if (spare_guid) | ||
- *spare_guid = 0ULL; | ||
- if (l2cache_guid) | ||
- *l2cache_guid = 0ULL; | ||
- | ||
- // Read the label, if any, and perform some basic sanity checks. | ||
- if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) | ||
- return (false); | ||
- | ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, &vdtxg); | ||
- | ||
- if nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0 || | ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &device_guid) != 0 { | ||
- nvlist_free(label); | ||
- return (false); | ||
- } | ||
- | ||
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && | ||
- (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, | ||
- &pool_guid) != 0 || | ||
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, | ||
- &txg) != 0)) { | ||
- nvlist_free(label); | ||
- return (false); | ||
- } | ||
- | ||
- nvlist_free(label); | ||
- | ||
- // Check to see if this device indeed belongs to the pool it claims to | ||
- // be a part of. The only way this is allowed is if the device is a hot | ||
- // spare (which we check for later on). | ||
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && | ||
- !spa_guid_exists(pool_guid, device_guid) && | ||
- !spa_spare_exists(device_guid, NULL, NULL) && | ||
- !spa_l2cache_exists(device_guid, NULL)) | ||
- return (false); | ||
- | ||
- // If the transaction group is zero, then this is an initialized (but | ||
- // unused) label. This is only an error if the create transaction | ||
- // on-disk is the same as the one we're using now, in which case the | ||
- // user has attempted to add the same vdev multiple times in the same | ||
- // transaction. | ||
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && | ||
- txg == 0 && vdtxg == crtxg) | ||
- return (true); | ||
- | ||
- // Check to see if this is a spare device. We do an explicit check for | ||
- // spa_has_spare() here because it may be on our pending list of spares | ||
- // to add. We also check if it is an l2cache device. | ||
- if (spa_spare_exists(device_guid, &spare_pool, NULL) || | ||
- spa_has_spare(spa, device_guid)) { | ||
- if (spare_guid) | ||
- *spare_guid = device_guid; | ||
- | ||
- switch (reason) { | ||
- case VDEV_LABEL_CREATE: | ||
- case VDEV_LABEL_L2CACHE: | ||
- return (true); | ||
- | ||
- case VDEV_LABEL_REPLACE: | ||
- return (!spa_has_spare(spa, device_guid) || | ||
- spare_pool != 0ULL); | ||
- | ||
- case VDEV_LABEL_SPARE: | ||
- return (spa_has_spare(spa, device_guid)); | ||
- default: | ||
- break; | ||
- } | ||
- } | ||
- | ||
- // Check to see if this is an l2cache device. | ||
- if (spa_l2cache_exists(device_guid, NULL)) | ||
- return true; | ||
- | ||
- // We can't rely on a pool's state if it's been imported | ||
- // read-only. Instead we look to see if the pool is marked | ||
- // read-only in the namespace and set the state to active. | ||
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && | ||
- (spa = spa_by_guid(pool_guid, device_guid)) != NULL && | ||
- spa_mode(spa) == FREAD) | ||
- state = POOL_STATE_ACTIVE; | ||
- | ||
- // If the device is marked ACTIVE, then this device is in use by another | ||
- // pool on the system. | ||
- return (state == POOL_STATE_ACTIVE); | ||
-} | ||
- | ||
-// Initialize a vdev label. We check to make sure each leaf device is not in | ||
-// use, and writable. We put down an initial label which we will later | ||
-// overwrite with a complete label. Note that it's important to do this | ||
-// sequentially, not in parallel, so that we catch cases of multiple use of the | ||
-// same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with | ||
-// itself. | ||
-int | ||
-vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) | ||
-{ | ||
- spa_t *spa = vd->vdev_spa; | ||
- nvlist_t *label; | ||
- vdev_phys_t *vp; | ||
- char *pad2; | ||
- uberblock_t *ub; | ||
- zio_t *zio; | ||
- char *buf; | ||
- size_t buflen; | ||
- int error; | ||
- uint64_t spare_guid = 0, l2cache_guid = 0; | ||
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; | ||
- int c, l; | ||
- vdev_t *pvd; | ||
- | ||
- assert!(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); | ||
- | ||
- for (c = 0; c < vd->vdev_children; c++) | ||
- if ((error = vdev_label_init(vd->vdev_child[c], | ||
- crtxg, reason)) != 0) | ||
- return (error); | ||
- | ||
- // Track the creation time for this vdev | ||
- vd->vdev_crtxg = crtxg; | ||
- | ||
- if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa)) | ||
- return (0); | ||
- | ||
- // Dead vdevs cannot be initialized. | ||
- if (vdev_is_dead(vd)) | ||
- return (SET_ERROR(EIO)); | ||
- | ||
- // Determine if the vdev is in use. | ||
- if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && | ||
- vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) | ||
- return (SET_ERROR(EBUSY)); | ||
- | ||
- // If this is a request to add or replace a spare or l2cache device | ||
- // that is in use elsewhere on the system, then we must update the | ||
- // guid (which was initialized to a random value) to reflect the | ||
- // actual GUID (which is shared between multiple pools). | ||
- if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && | ||
- spare_guid != 0ULL) { | ||
- uint64_t guid_delta = spare_guid - vd->vdev_guid; | ||
- | ||
- vd->vdev_guid += guid_delta; | ||
- | ||
- for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) | ||
- pvd->vdev_guid_sum += guid_delta; | ||
- | ||
- // If this is a replacement, then we want to fallthrough to the | ||
- // rest of the code. If we're adding a spare, then it's already | ||
- // labeled appropriately and we can just return. | ||
- if (reason == VDEV_LABEL_SPARE) | ||
- return (0); | ||
- assert!(reason == VDEV_LABEL_REPLACE || | ||
- reason == VDEV_LABEL_SPLIT); | ||
- } | ||
- | ||
- if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && | ||
- l2cache_guid != 0ULL) { | ||
- uint64_t guid_delta = l2cache_guid - vd->vdev_guid; | ||
- | ||
- vd->vdev_guid += guid_delta; | ||
- | ||
- for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) | ||
- pvd->vdev_guid_sum += guid_delta; | ||
- | ||
- // If this is a replacement, then we want to fallthrough to the | ||
- // rest of the code. If we're adding an l2cache, then it's | ||
- // already labeled appropriately and we can just return. | ||
- if (reason == VDEV_LABEL_L2CACHE) | ||
- return (0); | ||
- assert!(reason == VDEV_LABEL_REPLACE); | ||
- } | ||
- | ||
- // Initialize its label. | ||
- vp = zio_buf_alloc(sizeof (vdev_phys_t)); | ||
- bzero(vp, sizeof (vdev_phys_t)); | ||
- | ||
- // Generate a label describing the pool and our top-level vdev. | ||
- // We mark it as being from txg 0 to indicate that it's not | ||
- // really part of an active pool just yet. The labels will | ||
- // be written again with a meaningful txg by spa_sync(). | ||
- if (reason == VDEV_LABEL_SPARE || | ||
- (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { | ||
- // For inactive hot spares, we generate a special label that | ||
- // identifies as a mutually shared hot spare. We write the | ||
- // label if we are adding a hot spare, or if we are removing an | ||
- // active hot spare (in which case we want to revert the | ||
- // labels). | ||
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); | ||
- | ||
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, | ||
- spa_version(spa)) == 0); | ||
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, | ||
- POOL_STATE_SPARE) == 0); | ||
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, | ||
- vd->vdev_guid) == 0); | ||
- } else if (reason == VDEV_LABEL_L2CACHE || | ||
- (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { | ||
- // For level 2 ARC devices, add a special label. | ||
- VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); | ||
- | ||
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, | ||
- spa_version(spa)) == 0); | ||
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, | ||
- POOL_STATE_L2CACHE) == 0); | ||
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, | ||
- vd->vdev_guid) == 0); | ||
- } else { | ||
- uint64_t txg = 0ULL; | ||
- | ||
- if (reason == VDEV_LABEL_SPLIT) | ||
- txg = spa->spa_uberblock.ub_txg; | ||
- label = spa_config_generate(spa, vd, txg, false); | ||
- | ||
- // Add our creation time. This allows us to detect multiple | ||
- // vdev uses as described above, and automatically expires if we | ||
- // fail. | ||
- VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, | ||
- crtxg) == 0); | ||
- } | ||
- | ||
- buf = vp->vp_nvlist; | ||
- buflen = sizeof (vp->vp_nvlist); | ||
- | ||
- error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); | ||
- if (error != 0) { | ||
- nvlist_free(label); | ||
- zio_buf_free(vp, sizeof (vdev_phys_t)); | ||
- /* EFAULT means nvlist_pack ran out of room */ | ||
- return (error == EFAULT ? ENAMETOOLONG : EINVAL); | ||
- } | ||
- | ||
- // Initialize uberblock template. | ||
- ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); | ||
- bzero(ub, VDEV_UBERBLOCK_RING); | ||
- *ub = spa->spa_uberblock; | ||
- ub->ub_txg = 0; | ||
- | ||
- // Initialize the 2nd padding area. | ||
- pad2 = zio_buf_alloc(VDEV_PAD_SIZE); | ||
- bzero(pad2, VDEV_PAD_SIZE); | ||
- | ||
- // Write everything in parallel. | ||
-retry: | ||
- zio = zio_root(spa, NULL, NULL, flags); | ||
- | ||
- for (l = 0; l < VDEV_LABELS; l++) { | ||
- | ||
- vdev_label_write(zio, vd, l, vp, | ||
- offsetof(vdev_label_t, vl_vdev_phys), | ||
- sizeof (vdev_phys_t), NULL, NULL, flags); | ||
- | ||
- // Skip the 1st padding area. | ||
- // Zero out the 2nd padding area where it might have | ||
- // left over data from previous filesystem format. | ||
- vdev_label_write(zio, vd, l, pad2, | ||
- offsetof(vdev_label_t, vl_pad2), | ||
- VDEV_PAD_SIZE, NULL, NULL, flags); | ||
- | ||
- vdev_label_write(zio, vd, l, ub, | ||
- offsetof(vdev_label_t, vl_uberblock), | ||
- VDEV_UBERBLOCK_RING, NULL, NULL, flags); | ||
- } | ||
- | ||
- error = zio_wait(zio); | ||
- | ||
- if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { | ||
- flags |= ZIO_FLAG_TRYHARD; | ||
- goto retry; | ||
- } | ||
- | ||
- nvlist_free(label); | ||
- zio_buf_free(pad2, VDEV_PAD_SIZE); | ||
- zio_buf_free(ub, VDEV_UBERBLOCK_RING); | ||
- zio_buf_free(vp, sizeof (vdev_phys_t)); | ||
- | ||
- // If this vdev hasn't been previously identified as a spare, then we | ||
- // mark it as such only if a) we are labeling it as a spare, or b) it | ||
- // exists as a spare elsewhere in the system. Do the same for | ||
- // level 2 ARC devices. | ||
- if (error == 0 && !vd->vdev_isspare && | ||
- (reason == VDEV_LABEL_SPARE || | ||
- spa_spare_exists(vd->vdev_guid, NULL, NULL))) | ||
- spa_spare_add(vd); | ||
- | ||
- if (error == 0 && !vd->vdev_isl2cache && | ||
- (reason == VDEV_LABEL_L2CACHE || | ||
- spa_l2cache_exists(vd->vdev_guid, NULL))) | ||
- spa_l2cache_add(vd); | ||
- | ||
- return (error); | ||
-} | ||
- | ||
-// ========================================================================== | ||
-// uberblock load/sync | ||
-// ========================================================================== | ||
- | ||
-// Consider the following situation: txg is safely synced to disk. We've | ||
-// written the first uberblock for txg + 1, and then we lose power. When we | ||
-// come back up, we fail to see the uberblock for txg + 1 because, say, | ||
-// it was on a mirrored device and the replica to which we wrote txg + 1 | ||
-// is now offline. If we then make some changes and sync txg + 1, and then | ||
-// the missing replica comes back, then for a few seconds we'll have two | ||
-// conflicting uberblocks on disk with the same txg. The solution is simple: | ||
-// among uberblocks with equal txg, choose the one with the latest timestamp. | ||
-fn uberblock_compare(a: &Uberblock, b: &Uberblock) -> i64 { | ||
- if a.txg < b.txg { | ||
- return -1; | ||
- } | ||
- if a.txg > b.txg { | ||
- return 1; | ||
- } | ||
- | ||
- if a.timestamp < b.timestamp { | ||
- return -1; | ||
- } | ||
- if a.timestamp > b.timestamp { | ||
- return 1; | ||
- } | ||
- | ||
- 0 | ||
-} | ||
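- | ||
-// A hedged worked example, not part of the original file: given two copies of | ||
-// txg 100, one stamped 1461000000 and one stamped 1461000007, uberblock_compare | ||
-// returns a positive value for the copy with the later timestamp, so the newer | ||
-// write wins the tie. | ||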
- | ||
-struct ubl_cbdata { | ||
- uberblock_t *ubl_ubbest; /* Best uberblock */ | ||
- vdev_t *ubl_vd; /* vdev associated with the above */ | ||
-}; | ||
- | ||
-fn uberblock_load_done(zio_t *zio) { | ||
- vdev_t *vd = zio->vd; | ||
- spa_t *spa = zio->spa; | ||
- zio_t *rio = zio->private; | ||
- uberblock_t *ub = zio->data; | ||
- struct ubl_cbdata *cbp = rio->private; | ||
- | ||
- //assert!(zio.size == VDEV_UBERBLOCK_SIZE(vd)); | ||
- | ||
- if (zio->error == 0 && uberblock_verify(ub) == 0) { | ||
- mutex_enter(&rio->lock); | ||
- if (ub->ub_txg <= spa->spa_load_max_txg && | ||
- uberblock_compare(ub, cbp->ubl_ubbest) > 0) { | ||
- // Keep track of the vdev in which this uberblock | ||
- // was found. We will use this information later | ||
- // to obtain the config nvlist associated with | ||
- // this uberblock. | ||
- *cbp->ubl_ubbest = *ub; | ||
- cbp->ubl_vd = vd; | ||
- } | ||
- mutex_exit(&rio->lock); | ||
- } | ||
- | ||
- zbuf_free(zio->data, zio->size); | ||
-} | ||
- | ||
-fn uberblock_load_impl(zio: &Zio, vdev_t *vd, int flags, struct ubl_cbdata *cbp) { | ||
- for c in 0..vd->vdev_children { | ||
- uberblock_load_impl(zio, vd.vdev_child[c], flags, cbp); | ||
- } | ||
- | ||
- if vd.ops.vdev_op_leaf && vdev_readable(vd) { | ||
- for l in 0..VDEV_LABELS { | ||
- for n in 0..VDEV_UBERBLOCK_COUNT(vd) { | ||
- vdev_label_read(zio, vd, l, zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)), | ||
- VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), | ||
- uberblock_load_done, zio, flags); | ||
- } | ||
- } | ||
- } | ||
-} | ||
- | ||
-// Reads the 'best' uberblock from disk along with its associated | ||
-// configuration. First, we read the uberblock array of each label of each | ||
-// vdev, keeping track of the uberblock with the highest txg in each array. | ||
-// Then, we read the configuration from the same vdev as the best uberblock. | ||
-fn uberblock_load(vdev_t *rvd, ub: &Uberblock, nvlist_t **config) -> Option<Uberblock> { | ||
- spa_t *spa = rvd->vdev_spa; | ||
- struct ubl_cbdata cb; | ||
- let flags = zio::FLAG_CONFIG_WRITER | zio::FLAG_CANFAIL | | ||
- zio::FLAG_SPECULATIVE | zio::FLAG_TRYHARD; | ||
- | ||
- assert!(ub); | ||
- assert!(config); | ||
- | ||
- bzero(ub, sizeof (uberblock_t)); | ||
- *config = NULL; | ||
- | ||
- cb.ubl_ubbest = ub; | ||
- cb.ubl_vd = NULL; | ||
- | ||
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); | ||
- let zio = Zio::root(spa, None, &cb, flags); | ||
- uberblock_load_impl(zio, rvd, flags, &cb); | ||
- zio.wait(); | ||
- | ||
- // It's possible that the best uberblock was discovered on a label | ||
- // that has a configuration which was written in a future txg. | ||
- // Search all labels on this vdev to find the configuration that | ||
- // matches the txg for our uberblock. | ||
- if (cb.ubl_vd != NULL) | ||
- *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); | ||
- spa_config_exit(spa, SCL_ALL, FTAG); | ||
-} | ||
- | ||
-// On success, increment root zio's count of good writes. | ||
-// We only get credit for writes to known-visible vdevs; see spa_vdev_add(). | ||
-fn vdev_uberblock_sync_done(zio_t *zio) { | ||
- uint64_t *good_writes = zio->io_private; | ||
- | ||
- if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) | ||
- atomic_add_64(good_writes, 1); | ||
-} | ||
- | ||
-// Write the uberblock to all labels of all leaves of the specified vdev. | ||
-fn vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) { | ||
- uberblock_t *ubbuf; | ||
- int c, l, n; | ||
- | ||
- for (c = 0; c < vd->vdev_children; c++) { | ||
- vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags); | ||
- } | ||
- | ||
- if !vd->vdev_ops->vdev_op_leaf { | ||
- return; | ||
- } | ||
- | ||
- if !vdev_writeable(vd) { | ||
- return; | ||
- } | ||
- | ||
- n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1); | ||
- | ||
- ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); | ||
- bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); | ||
- *ubbuf = *ub; | ||
- | ||
- for (l = 0; l < VDEV_LABELS; l++) { | ||
- vdev_label_write(zio, vd, l, ubbuf, | ||
- VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), | ||
- vdev_uberblock_sync_done, zio->io_private, | ||
- flags | ZIO_FLAG_DONT_PROPAGATE); | ||
- } | ||
- | ||
- zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); | ||
-} | ||
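The slot index computed above, `ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1)`, treats the per-label uberblock array as a power-of-two ring, so consecutive txgs land in different slots and never overwrite the previous sync's uberblock in place. A quick arithmetic check, assuming the common 128-entry array:

    let count: u64 = 128; // assumed VDEV_UBERBLOCK_COUNT(vd)
    assert_eq!(1000u64 & (count - 1), 1000u64 % count); // both are 104; txg 1001 lands in slot 105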
- | ||
-// Sync the uberblocks to all vdevs in svd[] | ||
-fn vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) -> zfs::Result<()> { | ||
- spa_t *spa = svd[0]->vdev_spa; | ||
- zio_t *zio; | ||
- uint64_t good_writes = 0; | ||
- int v; | ||
- | ||
- zio = zio_root(spa, NULL, &good_writes, flags); | ||
- | ||
- for (v = 0; v < svdcount; v++) | ||
- vdev_uberblock_sync(zio, ub, svd[v], flags); | ||
- | ||
- (void) zio_wait(zio); | ||
- | ||
- // Flush the uberblocks to disk. This ensures that the odd labels | ||
- // are no longer needed (because the new uberblocks and the even | ||
- // labels are safely on disk), so it is safe to overwrite them. | ||
- zio = zio_root(spa, NULL, NULL, flags); | ||
- | ||
- for (v = 0; v < svdcount; v++) | ||
- zio_flush(zio, svd[v]); | ||
- | ||
- (void) zio_wait(zio); | ||
- | ||
- return (good_writes >= 1 ? 0 : EIO); | ||
-} | ||
- | ||
-// On success, increment the count of good writes for our top-level vdev. | ||
-fn vdev_label_sync_done(zio_t *zio) { | ||
- uint64_t *good_writes = zio->io_private; | ||
- | ||
- if (zio->io_error == 0) | ||
- atomic_add_64(good_writes, 1); | ||
-} | ||
- | ||
-// If there weren't enough good writes, indicate failure to the parent. | ||
-fn vdev_label_sync_top_done(zio_t *zio) { | ||
- uint64_t *good_writes = zio->io_private; | ||
- | ||
- if (*good_writes == 0) | ||
- zio->io_error = SET_ERROR(EIO); | ||
- | ||
- kmem_free(good_writes, sizeof (uint64_t)); | ||
-} | ||
- | ||
-// We ignore errors for log and cache devices, simply free the private data. | ||
-fn vdev_label_sync_ignore_done(zio_t *zio) { | ||
- kmem_free(zio->io_private, sizeof (uint64_t)); | ||
-} | ||
- | ||
-// Write all even or odd labels to all leaves of the specified vdev. | ||
-fn vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags) { | ||
- nvlist_t *label; | ||
- vdev_phys_t *vp; | ||
- char *buf; | ||
- size_t buflen; | ||
- int c; | ||
- | ||
- for (c = 0; c < vd->vdev_children; c++) | ||
- vdev_label_sync(zio, vd->vdev_child[c], l, txg, flags); | ||
- | ||
- if (!vd->vdev_ops->vdev_op_leaf) | ||
- return; | ||
- | ||
- if (!vdev_writeable(vd)) | ||
- return; | ||
- | ||
- // Generate a label describing the top-level config to which we belong. | ||
- label = spa_config_generate(vd->vdev_spa, vd, txg, false); | ||
- | ||
- vp = zio_buf_alloc(sizeof (vdev_phys_t)); | ||
- bzero(vp, sizeof (vdev_phys_t)); | ||
- | ||
- buf = vp->vp_nvlist; | ||
- buflen = sizeof (vp->vp_nvlist); | ||
- | ||
- if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { | ||
- for (; l < VDEV_LABELS; l += 2) { | ||
- vdev_label_write(zio, vd, l, vp, | ||
- offsetof(vdev_label_t, vl_vdev_phys), | ||
- sizeof (vdev_phys_t), | ||
- vdev_label_sync_done, zio->io_private, | ||
- flags | ZIO_FLAG_DONT_PROPAGATE); | ||
- } | ||
- } | ||
- | ||
- zio_buf_free(vp, sizeof (vdev_phys_t)); | ||
- nvlist_free(label); | ||
-} | ||
- | ||
-fn vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) -> zfs::Result<()> { | ||
- list_t *dl = &spa->spa_config_dirty_list; | ||
- vdev_t *vd; | ||
- zio_t *zio; | ||
- int error; | ||
- | ||
- // Write the new labels to disk. | ||
- zio = zio_root(spa, NULL, NULL, flags); | ||
- | ||
- for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { | ||
- uint64_t *good_writes; | ||
- zio_t *vio; | ||
- | ||
- assert!(!vd->vdev_ishole); | ||
- | ||
- good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); | ||
- vio = zio_null(zio, spa, NULL, | ||
- (vd->vdev_islog || vd->vdev_aux != NULL) ? | ||
- vdev_label_sync_ignore_done : vdev_label_sync_top_done, | ||
- good_writes, flags); | ||
- vdev_label_sync(vio, vd, l, txg, flags); | ||
- vio.no_wait(); | ||
- } | ||
- | ||
- error = zio.wait(); | ||
- | ||
- // Flush the new labels to disk. | ||
-    zio = Zio::root(spa, None, None, flags); | ||
- | ||
- for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { | ||
- zio.flush(vd); | ||
- } | ||
- | ||
- zio.wait(); | ||
- | ||
- return (error); | ||
-} | ||
- | ||
-// Sync the uberblock and any changes to the vdev configuration. | ||
-// | ||
-// The order of operations is carefully crafted to ensure that | ||
-// if the system panics or loses power at any time, the state on disk | ||
-// is still transactionally consistent. The in-line comments below | ||
-// describe the failure semantics at each stage. | ||
-// | ||
-// Moreover, vdev_config_sync() is designed to be idempotent: if it fails | ||
-// at any time, you can just call it again, and it will resume its work. | ||
-fn config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) -> zfs::Result<()> { | ||
- spa_t *spa = svd[0]->vdev_spa; | ||
- uberblock_t *ub = &spa->spa_uberblock; | ||
- vdev_t *vd; | ||
- int error; | ||
- int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; | ||
- | ||
- // Normally, we don't want to try too hard to write every label and | ||
- // uberblock. If there is a flaky disk, we don't want the rest of the | ||
- // sync process to block while we retry. But if we can't write a | ||
- // single label out, we should retry with ZIO_FLAG_TRYHARD before | ||
- // bailing out and declaring the pool faulted. | ||
- if tryhard { | ||
- flags |= ZIO_FLAG_TRYHARD; | ||
- } | ||
- | ||
- assert!(ub->ub_txg <= txg); | ||
- | ||
- // If this isn't a resync due to I/O errors, | ||
- // and nothing changed in this transaction group, | ||
- // and the vdev configuration hasn't changed, | ||
- // then there's nothing to do. | ||
- if ub->ub_txg < txg && | ||
- uberblock_update(ub, spa->spa_root_vdev, txg) == false && | ||
- list_is_empty(&spa->spa_config_dirty_list) { | ||
- return 0; | ||
- } | ||
- | ||
- if txg > spa_freeze_txg(spa) { | ||
- return 0; | ||
- } | ||
- | ||
- assert!(txg <= spa->spa_final_txg); | ||
- | ||
- // Flush the write cache of every disk that's been written to | ||
- // in this transaction group. This ensures that all blocks | ||
- // written in this txg will be committed to stable storage | ||
- // before any uberblock that references them. | ||
- let zio = Zio::root(spa, None, None, flags); | ||
- | ||
- for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd; | ||
- vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) | ||
- zio.flush(vd); | ||
- | ||
- zio.wait(); | ||
- | ||
- // Sync out the even labels (L0, L2) for every dirty vdev. If the | ||
- // system dies in the middle of this process, that's OK: all of the | ||
- // even labels that made it to disk will be newer than any uberblock, | ||
- // and will therefore be considered invalid. The odd labels (L1, L3), | ||
- // which have not yet been touched, will still be valid. We flush | ||
- // the new labels to disk to ensure that all even-label updates | ||
- // are committed to stable storage before the uberblock update. | ||
- if (error = vdev_label_sync_list(spa, 0, txg, flags)) != 0 { | ||
- return error; | ||
- } | ||
- | ||
- // Sync the uberblocks to all vdevs in svd[]. | ||
- // If the system dies in the middle of this step, there are two cases | ||
- // to consider, and the on-disk state is consistent either way: | ||
- // | ||
- // (1) If none of the new uberblocks made it to disk, then the | ||
- // previous uberblock will be the newest, and the odd labels | ||
- // (which had not yet been touched) will be valid with respect | ||
- // to that uberblock. | ||
- // | ||
- // (2) If one or more new uberblocks made it to disk, then they | ||
- // will be the newest, and the even labels (which had all | ||
- // been successfully committed) will be valid with respect | ||
- // to the new uberblocks. | ||
- if (error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0 { | ||
- return error; | ||
- } | ||
- | ||
- // Sync out odd labels for every dirty vdev. If the system dies | ||
- // in the middle of this process, the even labels and the new | ||
- // uberblocks will suffice to open the pool. The next time | ||
- // the pool is opened, the first thing we'll do -- before any | ||
- // user data is modified -- is mark every vdev dirty so that | ||
- // all labels will be brought up to date. We flush the new labels | ||
- // to disk to ensure that all odd-label updates are committed to | ||
- // stable storage before the next transaction group begins. | ||
- vdev_label_sync_list(spa, 1, txg, flags) | ||
-} |
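The ordering argument above boils down to three flushed phases after the initial write-cache flush. A compressed sketch, with the C-style helpers in this file imagined as Rust functions and a spa.uberblock field assumed:

    // Sketch only: the transactionally safe ordering config_sync() relies on.
    fn config_sync_phases(spa: &mut Spa, svd: &mut [Vdev], txg: u64, flags: i32) -> zfs::Result<()> {
        try!(vdev_label_sync_list(spa, 0, txg, flags));                        // 1: even labels (L0, L2), flushed
        try!(vdev_uberblock_sync_list(svd, svd.len(), &spa.uberblock, flags)); // 2: uberblocks, flushed
        vdev_label_sync_list(spa, 1, txg, flags)                               // 3: odd labels (L1, L3), flushed
    }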
682
crates/zfs/vdev_queue.rs
@@ -1,682 +0,0 @@ | ||
-use super::zio; | ||
- | ||
-// ZFS IO Scheduler | ||
-// --------------- | ||
-// | ||
-// ZFS issues IO operations to leaf vdevs to satisfy and complete zios. The | ||
-// IO scheduler determines when and in what order those operations are | ||
-// issued. The IO scheduler divides operations into five IO classes | ||
-// prioritized in the following order: sync read, sync write, async read, | ||
-// async write, and scrub/resilver. Each queue defines the minimum and | ||
-// maximum number of concurrent operations that may be issued to the device. | ||
-// In addition, the device has an aggregate maximum. Note that the sum of the | ||
-// per-queue minimums must not exceed the aggregate maximum. If the | ||
-// sum of the per-queue maximums exceeds the aggregate maximum, then the | ||
-// number of active IOs may reach zfs_vdev_max_active, in which case no | ||
-// further IOs will be issued regardless of whether all per-queue | ||
-// minimums have been met. | ||
-// | ||
-// For many physical devices, throughput increases with the number of | ||
-// concurrent operations, but latency typically suffers. Further, physical | ||
-// devices typically have a limit at which more concurrent operations have no | ||
-// effect on throughput or can actually cause it to decrease. | ||
-// | ||
-// The scheduler selects the next operation to issue by first looking for an | ||
-// IO class whose minimum has not been satisfied. Once all are satisfied and | ||
-// the aggregate maximum has not been hit, the scheduler looks for classes | ||
-// whose maximum has not been satisfied. Iteration through the IO classes is | ||
-// done in the order specified above. No further operations are issued if the | ||
-// aggregate maximum number of concurrent operations has been hit or if there | ||
-// are no operations queued for an IO class that has not hit its maximum. | ||
-// Every time an IO is queued or an operation completes, the IO scheduler | ||
-// looks for new operations to issue. | ||
-// | ||
-// All IO classes have a fixed maximum number of outstanding operations | ||
-// except for the async write class. Asynchronous writes represent the data | ||
-// that is committed to stable storage during the syncing stage for | ||
-// transaction groups (see txg.c). Transaction groups enter the syncing state | ||
-// periodically so the number of queued async writes will quickly burst up and | ||
-// then bleed down to zero. Rather than servicing them as quickly as possible, | ||
-// the IO scheduler changes the maximum number of active async write IOs | ||
-// according to the amount of dirty data in the pool (see dsl_pool.c). Since | ||
-// both throughput and latency typically increase with the number of | ||
-// concurrent operations issued to physical devices, reducing the burstiness | ||
-// in the number of concurrent operations also stabilizes the response time of | ||
-// operations from other -- and in particular synchronous -- queues. In broad | ||
-// strokes, the IO scheduler will issue more concurrent operations from the | ||
-// async write queue as there's more dirty data in the pool. | ||
-// | ||
-// Async Writes | ||
-// | ||
-// The number of concurrent operations issued for the async write IO class | ||
-// follows a piece-wise linear function defined by a few adjustable points. | ||
-// | ||
-// | o---------| <-- zfs_vdev_async_write_max_active | ||
-// ^ | /^ | | ||
-// | | / | | | ||
-// active | / | | | ||
-// IO | / | | | ||
-// count | / | | | ||
-// | / | | | ||
-// |------------o | | <-- zfs_vdev_async_write_min_active | ||
-// 0|____________^______|_________| | ||
-// 0% | | 100% of zfs_dirty_data_max | ||
-// | | | ||
-// | `-- zfs_vdev_async_write_active_max_dirty_percent | ||
-// `--------- zfs_vdev_async_write_active_min_dirty_percent | ||
-// | ||
-// Until the amount of dirty data exceeds a minimum percentage of the dirty | ||
-// data allowed in the pool, the IO scheduler will limit the number of | ||
-// concurrent operations to the minimum. As that threshold is crossed, the | ||
-// number of concurrent operations issued increases linearly to the maximum at | ||
-// the specified maximum percentage of the dirty data allowed in the pool. | ||
-// | ||
-// Ideally, the amount of dirty data on a busy pool will stay in the sloped | ||
-// part of the function between zfs_vdev_async_write_active_min_dirty_percent | ||
-// and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the | ||
-// maximum percentage, this indicates that the rate of incoming data is | ||
-// greater than the rate that the backend storage can handle. In this case, we | ||
-// must further throttle incoming writes (see dmu_tx_delay() for details). | ||
- | ||
-// the sum of each queue's max_active. It must be at least the sum of each | ||
-// queue's min_active. | ||
-uint32_t zfs_vdev_max_active = 1000; | ||
- | ||
-// Per-queue limits on the number of IOs active to each device. If the | ||
-// number of active IOs is < zfs_vdev_max_active, then the min_active comes | ||
-// into play. We will send min_active from each queue, and then select from | ||
-// queues in the order defined by zio_priority_t. | ||
-// | ||
-// In general, smaller max_active's will lead to lower latency of synchronous | ||
-// operations. Larger max_active's may lead to higher overall throughput, | ||
-// depending on underlying storage. | ||
-// | ||
-// The ratio of the queues' max_actives determines the balance of performance | ||
-// between reads, writes, and scrubs. E.g., increasing | ||
-// zfs_vdev_scrub_max_active will cause the scrub or resilver to complete | ||
-// more quickly, but reads and writes to have higher latency and lower | ||
-// throughput. | ||
-uint32_t zfs_vdev_sync_read_min_active = 10; | ||
-uint32_t zfs_vdev_sync_read_max_active = 10; | ||
-uint32_t zfs_vdev_sync_write_min_active = 10; | ||
-uint32_t zfs_vdev_sync_write_max_active = 10; | ||
-uint32_t zfs_vdev_async_read_min_active = 1; | ||
-uint32_t zfs_vdev_async_read_max_active = 3; | ||
-uint32_t zfs_vdev_async_write_min_active = 1; | ||
-uint32_t zfs_vdev_async_write_max_active = 10; | ||
-uint32_t zfs_vdev_scrub_min_active = 1; | ||
-uint32_t zfs_vdev_scrub_max_active = 2; | ||
- | ||
-// When the pool has less than zfs_vdev_async_write_active_min_dirty_percent | ||
-// dirty data, use zfs_vdev_async_write_min_active. When it has more than | ||
-// zfs_vdev_async_write_active_max_dirty_percent, use | ||
-// zfs_vdev_async_write_max_active. The value is linearly interpolated | ||
-// between min and max. | ||
-int zfs_vdev_async_write_active_min_dirty_percent = 30; | ||
-int zfs_vdev_async_write_active_max_dirty_percent = 60; | ||
- | ||
-// To reduce IOPs, we aggregate small adjacent IOs into one large IO. | ||
-// For read IOs, we also aggregate across small adjacency gaps; for writes | ||
-// we include spans of optional IOs to aid aggregation at the disk even when | ||
-// they aren't able to help us aggregate at this level. | ||
-int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; | ||
-int zfs_vdev_read_gap_limit = 32 << 10; | ||
-int zfs_vdev_write_gap_limit = 4 << 10; | ||
- | ||
-fn vdev_queue_offset_compare(const void *x1, const void *x2) -> i32 { | ||
- const zio_t *z1 = x1; | ||
- const zio_t *z2 = x2; | ||
- | ||
- if z1.offset < z2.offset { | ||
- return -1; | ||
- } | ||
- if z1.offset > z2.offset { | ||
- return 1; | ||
- } | ||
- | ||
- if z1 < z2 { | ||
- return -1; | ||
- } | ||
- if z1 > z2 { | ||
- return 1; | ||
- } | ||
- | ||
- return 0; | ||
-} | ||
- | ||
-static inline avl_tree_t * | ||
-vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) | ||
-{ | ||
- return (&vq->vq_class[p].vqc_queued_tree); | ||
-} | ||
- | ||
-static inline avl_tree_t * | ||
-vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) | ||
-{ | ||
- assert!(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE); | ||
- if t == ZIO_TYPE_READ { | ||
- return &vq->vq_read_offset_tree; | ||
- } else { | ||
- return &vq->vq_write_offset_tree; | ||
- } | ||
-} | ||
- | ||
-int | ||
-vdev_queue_timestamp_compare(const void *x1, const void *x2) | ||
-{ | ||
- const zio_t *z1 = x1; | ||
- const zio_t *z2 = x2; | ||
- | ||
- if (z1->io_timestamp < z2->io_timestamp) | ||
- return (-1); | ||
- if (z1->io_timestamp > z2->io_timestamp) | ||
- return (1); | ||
- | ||
- if (z1 < z2) | ||
- return (-1); | ||
- if (z1 > z2) | ||
- return (1); | ||
- | ||
- return (0); | ||
-} | ||
- | ||
-static int | ||
-vdev_queue_class_min_active(zio_priority_t p) | ||
-{ | ||
- switch (p) { | ||
- case ZIO_PRIORITY_SYNC_READ: | ||
- return (zfs_vdev_sync_read_min_active); | ||
- case ZIO_PRIORITY_SYNC_WRITE: | ||
- return (zfs_vdev_sync_write_min_active); | ||
- case ZIO_PRIORITY_ASYNC_READ: | ||
- return (zfs_vdev_async_read_min_active); | ||
- case ZIO_PRIORITY_ASYNC_WRITE: | ||
- return (zfs_vdev_async_write_min_active); | ||
- case ZIO_PRIORITY_SCRUB: | ||
- return (zfs_vdev_scrub_min_active); | ||
- default: | ||
- panic("invalid priority %u", p); | ||
- return (0); | ||
- } | ||
-} | ||
- | ||
-static int | ||
-vdev_queue_max_async_writes(spa_t *spa) | ||
-{ | ||
- int writes; | ||
- uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total; | ||
- uint64_t min_bytes = zfs_dirty_data_max * | ||
- zfs_vdev_async_write_active_min_dirty_percent / 100; | ||
- uint64_t max_bytes = zfs_dirty_data_max * | ||
- zfs_vdev_async_write_active_max_dirty_percent / 100; | ||
- | ||
- // Sync tasks correspond to interactive user actions. To reduce the | ||
- // execution time of those actions we push data out as fast as possible. | ||
- if (spa_has_pending_synctask(spa)) { | ||
- return zfs_vdev_async_write_max_active; | ||
- } | ||
- | ||
- if dirty < min_bytes { | ||
- return zfs_vdev_async_write_min_active; | ||
- } | ||
- if dirty > max_bytes { | ||
- return zfs_vdev_async_write_max_active; | ||
- } | ||
- | ||
- // linear interpolation: | ||
- // slope = (max_writes - min_writes) / (max_bytes - min_bytes) | ||
- // move right by min_bytes | ||
- // move up by min_writes | ||
- writes = (dirty - min_bytes) * | ||
- (zfs_vdev_async_write_max_active - zfs_vdev_async_write_min_active) / | ||
- (max_bytes - min_bytes) + zfs_vdev_async_write_min_active; | ||
- assert!(writes >= zfs_vdev_async_write_min_active); | ||
- assert!(writes <= zfs_vdev_async_write_max_active); | ||
- return (writes); | ||
-} | ||
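As a worked example of the interpolation above (a sketch, using the default tunables declared earlier and assuming zfs_dirty_data_max is 1 GiB, a value owned by dsl_pool): at 45% dirty, halfway between the 30% and 60% thresholds, the queue depth comes out to 5.

    let dirty_max: u64 = 1 << 30;             // assumed zfs_dirty_data_max = 1 GiB
    let (min_pct, max_pct) = (30u64, 60u64);  // active_{min,max}_dirty_percent defaults
    let (min_w, max_w) = (1u64, 10u64);       // async_write_{min,max}_active defaults
    let dirty = dirty_max * 45 / 100;         // 45% of the dirty limit outstanding
    let min_bytes = dirty_max * min_pct / 100;
    let max_bytes = dirty_max * max_pct / 100;
    let writes = (dirty - min_bytes) * (max_w - min_w) / (max_bytes - min_bytes) + min_w;
    assert_eq!(writes, 5); // 1 + 9 * 0.5 = 5.5, truncated to 5 by integer division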
- | ||
-fn vdev_queue_class_max_active(spa_t *spa, p: zio::Priority) -> int { | ||
- match p { | ||
- zio::Priority::SyncRead => zfs_vdev_sync_read_max_active, | ||
- zio::Priority::SyncWrite => zfs_vdev_sync_write_max_active, | ||
- zio::Priority::AsyncRead => zfs_vdev_async_read_max_active, | ||
- zio::Priority::AsyncWrite => vdev_queue_max_async_writes(spa), | ||
- zio::Priority::Scrub => zfs_vdev_scrub_max_active, | ||
-        _ => panic!("invalid priority {:?}", p), | ||
- } | ||
-} | ||
- | ||
-// Return the IO class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if | ||
-// there is no eligible class. | ||
-static zio_priority_t | ||
-vdev_queue_class_to_issue(vdev_queue_t *vq) | ||
-{ | ||
- spa_t *spa = vq->vq_vdev->vdev_spa; | ||
- zio_priority_t p; | ||
- | ||
- if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) | ||
- return (ZIO_PRIORITY_NUM_QUEUEABLE); | ||
- | ||
- for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { | ||
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && | ||
- vq->vq_class[p].vqc_active < | ||
- vdev_queue_class_min_active(p)) | ||
- return (p); | ||
- } | ||
- | ||
- // If we haven't found a queue, look for one that hasn't reached its | ||
- // maximum # outstanding IOs. | ||
- for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { | ||
- if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && | ||
- vq->vq_class[p].vqc_active < | ||
- vdev_queue_class_max_active(spa, p)) | ||
- return (p); | ||
- } | ||
- | ||
- return (ZIO_PRIORITY_NUM_QUEUEABLE); | ||
-} | ||
- | ||
-void | ||
-vdev_queue_init(vdev_t *vd) | ||
-{ | ||
- vdev_queue_t *vq = &vd->vdev_queue; | ||
- zio_priority_t p; | ||
- | ||
- mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); | ||
- vq->vq_vdev = vd; | ||
- taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); | ||
- | ||
- avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, | ||
- sizeof (zio_t), offsetof(struct zio, io_queue_node)); | ||
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), | ||
- vdev_queue_offset_compare, sizeof (zio_t), | ||
- offsetof(struct zio, io_offset_node)); | ||
- avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), | ||
- vdev_queue_offset_compare, sizeof (zio_t), | ||
- offsetof(struct zio, io_offset_node)); | ||
- | ||
- for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { | ||
- int (*compfn) (const void *, const void *); | ||
- | ||
- // The synchronous IO queues are dispatched in FIFO rather | ||
- // than LBA order. This provides more consistent latency for | ||
- // these IOs. | ||
- if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE) | ||
- compfn = vdev_queue_timestamp_compare; | ||
- else | ||
- compfn = vdev_queue_offset_compare; | ||
- avl_create(vdev_queue_class_tree(vq, p), compfn, | ||
- sizeof (zio_t), offsetof(struct zio, io_queue_node)); | ||
- } | ||
-} | ||
- | ||
-void | ||
-vdev_queue_fini(vdev_t *vd) | ||
-{ | ||
- vdev_queue_t *vq = &vd->vdev_queue; | ||
- zio_priority_t p; | ||
- | ||
- for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) | ||
- avl_destroy(vdev_queue_class_tree(vq, p)); | ||
- avl_destroy(&vq->vq_active_tree); | ||
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); | ||
- avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); | ||
- | ||
- mutex_destroy(&vq->vq_lock); | ||
-} | ||
- | ||
-static void | ||
-vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) | ||
-{ | ||
- spa_t *spa = zio->io_spa; | ||
- spa_stats_history_t *ssh = &spa->spa_stats.io_history; | ||
- | ||
-    assert!(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE); | ||
- avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); | ||
- avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); | ||
- | ||
- if (ssh->kstat != NULL) { | ||
- mutex_enter(&ssh->lock); | ||
- kstat_waitq_enter(ssh->kstat->ks_data); | ||
- mutex_exit(&ssh->lock); | ||
- } | ||
-} | ||
- | ||
-static void | ||
-vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) | ||
-{ | ||
- spa_t *spa = zio->io_spa; | ||
- spa_stats_history_t *ssh = &spa->spa_stats.io_history; | ||
- | ||
-    assert!(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE); | ||
- avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); | ||
- avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); | ||
- | ||
- if (ssh->kstat != NULL) { | ||
- mutex_enter(&ssh->lock); | ||
- kstat_waitq_exit(ssh->kstat->ks_data); | ||
- mutex_exit(&ssh->lock); | ||
- } | ||
-} | ||
- | ||
-static void | ||
-vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) | ||
-{ | ||
- spa_t *spa = zio->io_spa; | ||
- spa_stats_history_t *ssh = &spa->spa_stats.io_history; | ||
- | ||
- ASSERT(MUTEX_HELD(&vq->vq_lock)); | ||
-    assert!(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE); | ||
- vq->vq_class[zio->io_priority].vqc_active++; | ||
- avl_add(&vq->vq_active_tree, zio); | ||
- | ||
- if (ssh->kstat != NULL) { | ||
- mutex_enter(&ssh->lock); | ||
- kstat_runq_enter(ssh->kstat->ks_data); | ||
- mutex_exit(&ssh->lock); | ||
- } | ||
-} | ||
- | ||
-static void | ||
-vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) | ||
-{ | ||
- spa_t *spa = zio->io_spa; | ||
- spa_stats_history_t *ssh = &spa->spa_stats.io_history; | ||
- | ||
- ASSERT(MUTEX_HELD(&vq->vq_lock)); | ||
-    assert!(zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE); | ||
- vq->vq_class[zio->io_priority].vqc_active--; | ||
- avl_remove(&vq->vq_active_tree, zio); | ||
- | ||
- if (ssh->kstat != NULL) { | ||
- kstat_io_t *ksio = ssh->kstat->ks_data; | ||
- | ||
- mutex_enter(&ssh->lock); | ||
- kstat_runq_exit(ksio); | ||
- if (zio->io_type == ZIO_TYPE_READ) { | ||
- ksio->reads++; | ||
- ksio->nread += zio->io_size; | ||
- } else if (zio->io_type == ZIO_TYPE_WRITE) { | ||
- ksio->writes++; | ||
- ksio->nwritten += zio->io_size; | ||
- } | ||
- mutex_exit(&ssh->lock); | ||
- } | ||
-} | ||
- | ||
-fn vdev_queue_agg_io_done(aio: &mut Zio) { | ||
- if (aio.zio_type == ZIO_TYPE_READ) { | ||
- zio_t *pio; | ||
- while (pio = zio_walk_parents(aio)) != NULL { | ||
- bcopy(aio.data + (pio.offset - aio.offset), pio.data, pio.size); | ||
- } | ||
- } | ||
- | ||
- zio_buf_free(aio.data, aio.size); | ||
-} | ||
- | ||
-// Compute the range spanned by two IOs, which is the endpoint of the last | ||
-// (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). | ||
-// Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); | ||
-// thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. | ||
-#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) | ||
-#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) | ||
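The two macros read more easily as plain functions; a hedged Rust rendering with explicit offsets and sizes in place of the zio fields:

    // Span: end of the last IO minus start of the first. Gap: distance from the
    // end of one IO to the start of the next, so adjacent IOs have a gap of 0.
    fn io_span(first_off: i64, last_off: i64, last_size: i64) -> i64 {
        last_off + last_size - first_off
    }
    fn io_gap(first_off: i64, first_size: i64, next_off: i64) -> i64 {
        next_off - (first_off + first_size) // equals -io_span(next, first), as noted above
    }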
- | ||
-static zio_t * | ||
-vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) | ||
-{ | ||
- zio_t *first, *last, *aio, *dio, *mandatory, *nio; | ||
- uint64_t maxgap = 0; | ||
- uint64_t size; | ||
- boolean_t stretch = B_FALSE; | ||
- avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); | ||
- enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; | ||
- | ||
- if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) | ||
- return (NULL); | ||
- | ||
- // Prevent users from setting the zfs_vdev_aggregation_limit | ||
- // tuning larger than SPA_MAXBLOCKSIZE. | ||
- zfs_vdev_aggregation_limit = | ||
- MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE); | ||
- | ||
- first = last = zio; | ||
- | ||
- if (zio->io_type == ZIO_TYPE_READ) | ||
- maxgap = zfs_vdev_read_gap_limit; | ||
- | ||
- // We can aggregate IOs that are sufficiently adjacent and of | ||
- // the same flavor, as expressed by the AGG_INHERIT flags. | ||
- // The latter requirement is necessary so that certain | ||
- // attributes of the IO, such as whether it's a normal IO | ||
- // or a scrub/resilver, can be preserved in the aggregate. | ||
- // We can include optional IOs, but don't allow them | ||
- // to begin a range as they add no benefit in that situation. | ||
- | ||
- // We keep track of the last non-optional IO. | ||
- mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; | ||
- | ||
- // Walk backwards through sufficiently contiguous IOs | ||
-// recording the last non-optional IO. | ||
- while ((dio = AVL_PREV(t, first)) != NULL && | ||
- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && | ||
- IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit && | ||
- IO_GAP(dio, first) <= maxgap) { | ||
- first = dio; | ||
- if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) | ||
- mandatory = first; | ||
- } | ||
- | ||
- // Skip any initial optional IOs. | ||
- while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { | ||
- first = AVL_NEXT(t, first); | ||
- ASSERT(first != NULL); | ||
- } | ||
- | ||
- | ||
- // Walk forward through sufficiently contiguous IOs. | ||
- while ((dio = AVL_NEXT(t, last)) != NULL && | ||
- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && | ||
- IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit && | ||
- IO_GAP(last, dio) <= maxgap) { | ||
- last = dio; | ||
- if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) | ||
- mandatory = last; | ||
- } | ||
- | ||
- // Now that we've established the range of the IO aggregation | ||
- // we must decide what to do with trailing optional IOs. | ||
- // For reads, there's nothing to do. While we are unable to | ||
- // aggregate further, it's possible that a trailing optional | ||
- // IO would allow the underlying device to aggregate with | ||
- // subsequent IOs. We must therefore determine if the next | ||
- // non-optional IO is close enough to make aggregation | ||
- // worthwhile. | ||
- if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { | ||
- zio_t *nio = last; | ||
- while ((dio = AVL_NEXT(t, nio)) != NULL && | ||
- IO_GAP(nio, dio) == 0 && | ||
- IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { | ||
- nio = dio; | ||
- if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { | ||
- stretch = B_TRUE; | ||
- break; | ||
- } | ||
- } | ||
- } | ||
- | ||
- if (stretch) { | ||
- // This may be a no-op. | ||
- dio = AVL_NEXT(t, last); | ||
- dio->io_flags &= ~ZIO_FLAG_OPTIONAL; | ||
- } else { | ||
- while (last != mandatory && last != first) { | ||
- ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); | ||
- last = AVL_PREV(t, last); | ||
- ASSERT(last != NULL); | ||
- } | ||
- } | ||
- | ||
- if (first == last) | ||
- return (NULL); | ||
- | ||
- size = IO_SPAN(first, last); | ||
-    assert!(size <= zfs_vdev_aggregation_limit); | ||
- | ||
- aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, | ||
- zio_buf_alloc(size), size, first->io_type, zio->io_priority, | ||
- flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, | ||
- vdev_queue_agg_io_done, NULL); | ||
- aio->io_timestamp = first->io_timestamp; | ||
- | ||
- nio = first; | ||
- do { | ||
- dio = nio; | ||
- nio = AVL_NEXT(t, dio); | ||
-        assert!(dio->io_type == aio->io_type); | ||
- | ||
- if (dio->io_flags & ZIO_FLAG_NODATA) { | ||
-            assert!(dio->io_type == ZIO_TYPE_WRITE); | ||
- bzero((char *)aio->io_data + (dio->io_offset - | ||
- aio->io_offset), dio->io_size); | ||
- } else if (dio->io_type == ZIO_TYPE_WRITE) { | ||
- bcopy(dio->io_data, (char *)aio->io_data + | ||
- (dio->io_offset - aio->io_offset), | ||
- dio->io_size); | ||
- } | ||
- | ||
- zio_add_child(dio, aio); | ||
- vdev_queue_io_remove(vq, dio); | ||
- zio_vdev_io_bypass(dio); | ||
- zio_execute(dio); | ||
- } while (dio != last); | ||
- | ||
- return (aio); | ||
-} | ||
- | ||
-fn vdev_queue_io_to_issue(vdev_queue_t *vq) -> Option<Zio> { | ||
- zio_t *zio, *aio; | ||
- zio_priority_t p; | ||
- avl_index_t idx; | ||
- avl_tree_t *tree; | ||
- | ||
-again: | ||
- ASSERT(MUTEX_HELD(&vq->vq_lock)); | ||
- | ||
- p = vdev_queue_class_to_issue(vq); | ||
- | ||
- if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { | ||
- // No eligible queued IOs | ||
- return (NULL); | ||
- } | ||
- | ||
- // For LBA-ordered queues (async / scrub), issue the IO which follows | ||
- // the most recently issued IO in LBA (offset) order. | ||
- // | ||
- // For FIFO queues (sync), issue the IO with the lowest timestamp. | ||
- tree = vdev_queue_class_tree(vq, p); | ||
- vq->vq_io_search.io_timestamp = 0; | ||
- vq->vq_io_search.io_offset = vq->vq_last_offset + 1; | ||
-    avl_find(tree, &vq->vq_io_search, &idx); // must run so idx is set for avl_nearest below | ||
- zio = avl_nearest(tree, idx, AVL_AFTER); | ||
- if (zio == NULL) | ||
- zio = avl_first(tree); | ||
- assert!(zio->io_priority == p); | ||
- | ||
- aio = vdev_queue_aggregate(vq, zio); | ||
- if (aio != NULL) | ||
- zio = aio; | ||
- else | ||
- vdev_queue_io_remove(vq, zio); | ||
- | ||
- // If the IO is or was optional and therefore has no data, we need to | ||
- // simply discard it. We need to drop the vdev queue's lock to avoid a | ||
- // deadlock that we could encounter since this IO will complete | ||
- // immediately. | ||
- if (zio->io_flags & ZIO_FLAG_NODATA) { | ||
- mutex_exit(&vq->vq_lock); | ||
- zio_vdev_io_bypass(zio); | ||
- zio_execute(zio); | ||
- mutex_enter(&vq->vq_lock); | ||
- goto again; | ||
- } | ||
- | ||
- vdev_queue_pending_add(vq, zio); | ||
- vq->vq_last_offset = zio->io_offset; | ||
- | ||
- return (zio); | ||
-} | ||
- | ||
-pub fn vdev_queue_io(zio_t *zio) -> Option<Zio> { | ||
- vdev_queue_t *vq = &zio.vd.vdev_queue; | ||
- | ||
- if zio->io_flags & ZIO_FLAG_DONT_QUEUE != 0 { | ||
-        return Some(zio); | ||
- } | ||
- | ||
-    // Children IOs inherit their parent's priority, which might | ||
- // not match the child's IO type. Fix it up here. | ||
- if zio.zio_type == ZIO_TYPE_READ { | ||
- if zio->io_priority != ZIO_PRIORITY_SYNC_READ && | ||
- zio->io_priority != ZIO_PRIORITY_ASYNC_READ && | ||
- zio->io_priority != ZIO_PRIORITY_SCRUB | ||
- { | ||
- zio->io_priority = ZIO_PRIORITY_ASYNC_READ; | ||
- } | ||
- } else { | ||
- assert!(zio.zio_type == ZIO_TYPE_WRITE); | ||
- if (zio.priority != ZIO_PRIORITY_SYNC_WRITE && | ||
- zio.priority != ZIO_PRIORITY_ASYNC_WRITE) | ||
- zio.priority = ZIO_PRIORITY_ASYNC_WRITE; | ||
- } | ||
- | ||
- zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; | ||
- | ||
- mutex_enter(&vq->vq_lock); | ||
- zio.timestamp = gethrtime(); | ||
- vdev_queue_io_add(vq, zio); | ||
- let nio = vdev_queue_io_to_issue(vq); | ||
- mutex_exit(&vq->vq_lock); | ||
- | ||
- if let Some(nio) = nio { | ||
- if nio.done == vdev_queue_agg_io_done { | ||
- nio.no_wait(); | ||
- return None; | ||
- } | ||
- } | ||
- | ||
- nio | ||
-} | ||
- | ||
-fn vdev_queue_io_done(zio_t *zio) { | ||
- vdev_queue_t *vq = &zio->io_vd->vdev_queue; | ||
- zio_t *nio; | ||
- | ||
- if zio_injection_enabled { | ||
- delay(SEC_TO_TICK(zio_handle_io_delay(zio))); | ||
- } | ||
- | ||
- mutex_enter(&vq->vq_lock); | ||
- | ||
- vdev_queue_pending_remove(vq, zio); | ||
- | ||
- zio.delta = gethrtime() - zio.timestamp; | ||
- vq.io_complete_ts = gethrtime(); | ||
- vq.io_delta_ts = vq.io_complete_ts - zio.timestamp; | ||
- | ||
- while (nio = vdev_queue_io_to_issue(vq)) != NULL { | ||
- mutex_exit(&vq->vq_lock); | ||
- if (nio.done == vdev_queue_agg_io_done) { | ||
- nio.no_wait(); | ||
- } else { | ||
- zio_vdev_io_reissue(nio); | ||
- nio.execute(); | ||
- } | ||
- mutex_enter(&vq.lock); | ||
- } | ||
- | ||
- mutex_exit(&vq->vq_lock); | ||
-} |
145
crates/zfs/xdr/mem_ops.rs
@@ -1,145 +0,0 @@ | ||
-use std::{mem, ptr}; | ||
- | ||
-use super::{XdrOps, XdrError, XdrResult}; | ||
- | ||
-pub struct MemOps<'a> { | ||
- pos: usize, | ||
- buffer: &'a mut [u8], | ||
-} | ||
- | ||
-impl<'a> MemOps<'a> { | ||
- pub fn new(buffer: &'a mut [u8]) -> Self { | ||
- MemOps { | ||
- pos: 0, | ||
- buffer: buffer, | ||
- } | ||
- } | ||
-} | ||
- | ||
-// Xdr encodes things in big endian and values are aligned at 4 bytes. For example, a u8 would take | ||
-// up 4 bytes when serialized. | ||
-impl<'a> XdrOps for MemOps<'a> { | ||
- fn get_i64(&mut self) -> XdrResult<i64> { | ||
- if self.pos >= self.buffer.len() { | ||
- Err(XdrError) | ||
- } else if self.buffer.len() - self.pos < 8 { | ||
- Err(XdrError) | ||
- } else { | ||
- let d: &i64 = unsafe { mem::transmute(&self.buffer[self.pos]) }; | ||
- // let val_d = i64::from_be(*d); | ||
- self.pos += 8; | ||
- Ok(i64::from_be(*d)) | ||
- } | ||
- } | ||
- | ||
- fn put_i64(&mut self, l: i64) -> XdrResult<()> { | ||
- if self.pos >= self.buffer.len() || self.buffer.len() - self.pos < 8 { | ||
- // Buffer is too small | ||
- return Err(XdrError); | ||
- } | ||
- | ||
- let d: &mut i64 = unsafe { mem::transmute(&mut self.buffer[self.pos]) }; | ||
- *d = l.to_be(); | ||
- self.pos += 8; | ||
- Ok(()) | ||
- } | ||
- | ||
- fn get_i32(&mut self) -> XdrResult<i32> { | ||
- if self.pos >= self.buffer.len() { | ||
- Err(XdrError) | ||
- } else if self.buffer.len() - self.pos < 4 { | ||
- Err(XdrError) | ||
- } else { | ||
- let d: &i32 = unsafe { mem::transmute(&self.buffer[self.pos]) }; | ||
- self.pos += 4; | ||
- Ok(i32::from_be(*d)) | ||
- } | ||
- } | ||
- | ||
- fn put_i32(&mut self, i: i32) -> XdrResult<()> { | ||
- if self.pos >= self.buffer.len() || self.buffer.len() - self.pos < 4 { | ||
- // Buffer is too small | ||
- return Err(XdrError); | ||
- } | ||
- | ||
- let d: &mut i32 = unsafe { mem::transmute(&mut self.buffer[self.pos]) }; | ||
- *d = i.to_be(); | ||
- self.pos += 4; | ||
- Ok(()) | ||
- } | ||
- | ||
- fn get_bytes(&mut self, bytes: &mut [u8]) -> XdrResult<()> { | ||
- if bytes.is_empty() { | ||
- return Ok(()); | ||
- } | ||
- if self.pos >= self.buffer.len() { | ||
- Err(XdrError) | ||
- } else if self.buffer.len() - self.pos < bytes.len() { | ||
- Err(XdrError) | ||
- } else { | ||
- // Technically the upper bound on this slice doesn't have to be there | ||
- let src = self.buffer[self.pos..self.pos + bytes.len()].as_ptr(); | ||
- let dst = bytes.as_mut_ptr(); | ||
- unsafe { | ||
- ptr::copy(src, dst, bytes.len()); | ||
- } | ||
- self.pos += bytes.len(); | ||
- | ||
- Ok(()) | ||
- } | ||
- } | ||
- | ||
- fn put_bytes(&mut self, bytes: &[u8]) -> XdrResult<()> { | ||
- if self.pos >= self.buffer.len() || self.buffer.len() - self.pos < bytes.len() { | ||
- // Buffer is too small | ||
- return Err(XdrError); | ||
- } | ||
- | ||
- let src = bytes.as_ptr(); | ||
- // Technically the upper bound on this slice doesn't have to be there | ||
- let dst = self.buffer[self.pos..self.pos + bytes.len()].as_mut_ptr(); | ||
- unsafe { | ||
- ptr::copy(src, dst, bytes.len()); | ||
- } | ||
- self.pos += bytes.len(); | ||
- | ||
- Ok(()) | ||
- } | ||
- | ||
- fn get_pos(&self) -> usize { | ||
- self.pos | ||
- } | ||
- | ||
- fn set_pos(&mut self, new_pos: usize) -> XdrResult<()> { | ||
- self.pos = new_pos; | ||
- Ok(()) | ||
- } | ||
-} | ||
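The 4-byte alignment noted above is easy to see directly: every 32-bit put advances the cursor by exactly four bytes, which is also why a u8 costs a full word once it goes through the Xdr trait in xdr.rs. A small sketch in the same style as the tests below:

    #[test]
    fn test_mem_ops_alignment() {
        // Sketch: two 32-bit puts fill an 8-byte buffer, 4 bytes at a time.
        let mut buffer = [0u8; 8];
        let mut mem_ops = MemOps::new(&mut buffer);
        mem_ops.put_i32(7).unwrap();
        assert_eq!(mem_ops.get_pos(), 4);
        mem_ops.put_i32(8).unwrap();
        assert_eq!(mem_ops.get_pos(), 8);
    }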
- | ||
-#[test] | ||
-fn test_mem_ops_i64() { | ||
-    let mut buffer = [0u8, 0, 0, 0, 0, 0, 1, 1]; | ||
-    let mut mem_ops = MemOps::new(&mut buffer); | ||
-    assert!(mem_ops.get_i64().unwrap() == 257); | ||
-} | ||
- | ||
-#[test] | ||
-fn test_mem_ops_i64_and_back() { | ||
-    let mut buffer = [0u8; 8]; | ||
-    let mut mem_ops = MemOps::new(&mut buffer); | ||
-    mem_ops.put_i64(424242).unwrap(); | ||
-    mem_ops.set_pos(0).unwrap(); | ||
-    assert!(mem_ops.get_i64().unwrap() == 424242); | ||
-} | ||
- | ||
-#[test] | ||
-fn test_mem_ops_i32() { | ||
-    let mut buffer = [0u8, 0, 1, 1]; | ||
-    let mut mem_ops = MemOps::new(&mut buffer); | ||
-    assert!(mem_ops.get_i32().unwrap() == 257); | ||
-} | ||
- | ||
-#[test] | ||
-fn test_mem_ops_i32_and_back() { | ||
-    let mut buffer = [0u8; 4]; | ||
-    let mut mem_ops = MemOps::new(&mut buffer); | ||
-    mem_ops.put_i32(424242).unwrap(); | ||
-    mem_ops.set_pos(0).unwrap(); | ||
-    assert!(mem_ops.get_i32().unwrap() == 424242); | ||
- assert!(mem_ops.get_i32() == 424242); | ||
-} |
5
crates/zfs/xdr/mod.rs
@@ -1,5 +0,0 @@ | ||
-pub use self::xdr::*; | ||
-pub use self::mem_ops::MemOps; | ||
- | ||
-pub mod xdr; | ||
-pub mod mem_ops; |
219
crates/zfs/xdr/xdr.rs
@@ -1,219 +0,0 @@ | ||
-// use std::*; | ||
- | ||
-#[derive(Debug)] | ||
-pub struct XdrError; | ||
- | ||
-pub type XdrResult<T> = Result<T, XdrError>; | ||
- | ||
-pub enum XdrOp { | ||
- Encode, | ||
- Decode, | ||
- Free, | ||
-} | ||
- | ||
-// TODO: Return `XdrResult` instead | ||
-pub trait XdrOps { | ||
- /// Get a i64 from underlying stream | ||
- fn get_i64(&mut self) -> XdrResult<i64>; | ||
- | ||
- /// Put a i64 to underlying stream | ||
- fn put_i64(&mut self, l: i64) -> XdrResult<()>; | ||
- | ||
- /// Get a i32 from underlying stream | ||
- fn get_i32(&mut self) -> XdrResult<i32>; | ||
- | ||
- /// Put a i32 to underlying stream | ||
- fn put_i32(&mut self, i: i32) -> XdrResult<()>; | ||
- | ||
- /// Get some bytes from the underlying stream | ||
- fn get_bytes(&mut self, bytes: &mut [u8]) -> XdrResult<()>; | ||
- | ||
- /// Put some bytes into the underlying stream | ||
- fn put_bytes(&mut self, bytes: &[u8]) -> XdrResult<()>; | ||
- | ||
- /// Returns bytes off from beginning | ||
- fn get_pos(&self) -> usize; | ||
- | ||
- /// Lets you reposition the stream | ||
- fn set_pos(&mut self, offset: usize) -> XdrResult<()>; | ||
- | ||
-// TODO: Not sure if we'll need this? | ||
-// Buf quick ptr to buffered data | ||
-// fn inline(&mut self, len: usize) -> *mut i32; | ||
- | ||
-// TODO: Not sure if we'll need this? | ||
-// Change, retrieve client info | ||
-// fn control(&mut self, req: isize, op: void *); | ||
-} | ||
- | ||
-pub trait Xdr { | ||
- fn encode_bool(&mut self, i: bool) -> XdrResult<()>; | ||
- fn decode_bool(&mut self) -> XdrResult<bool>; | ||
- | ||
- fn encode_i8(&mut self, i: i8) -> XdrResult<()>; | ||
- fn decode_i8(&mut self) -> XdrResult<i8>; | ||
- | ||
- fn encode_u8(&mut self, u: u8) -> XdrResult<()>; | ||
- fn decode_u8(&mut self) -> XdrResult<u8>; | ||
- | ||
- fn encode_i16(&mut self, i: i16) -> XdrResult<()>; | ||
- fn decode_i16(&mut self) -> XdrResult<i16>; | ||
- | ||
- fn encode_u16(&mut self, u: u16) -> XdrResult<()>; | ||
- fn decode_u16(&mut self) -> XdrResult<u16>; | ||
- | ||
- fn encode_i32(&mut self, i: i32) -> XdrResult<()>; | ||
- fn decode_i32(&mut self) -> XdrResult<i32>; | ||
- | ||
- fn encode_u32(&mut self, u: u32) -> XdrResult<()>; | ||
- fn decode_u32(&mut self) -> XdrResult<u32>; | ||
- | ||
- fn encode_i64(&mut self, i: i64) -> XdrResult<()>; | ||
- fn decode_i64(&mut self) -> XdrResult<i64>; | ||
- | ||
- fn encode_u64(&mut self, u: u64) -> XdrResult<()>; | ||
- fn decode_u64(&mut self) -> XdrResult<u64>; | ||
- | ||
- fn encode_opaque(&mut self, bytes: &[u8]) -> XdrResult<()>; | ||
- fn decode_opaque(&mut self, bytes: &mut [u8]) -> XdrResult<()>; | ||
- | ||
- fn encode_bytes(&mut self, bytes: &[u8]) -> XdrResult<()>; | ||
- fn decode_bytes(&mut self) -> XdrResult<Vec<u8>>; | ||
- | ||
- fn encode_string(&mut self, string: &String) -> XdrResult<()>; | ||
- fn decode_string(&mut self) -> XdrResult<String>; | ||
-} | ||
- | ||
-impl<T: XdrOps> Xdr for T { | ||
- fn encode_bool(&mut self, b: bool) -> XdrResult<()> { | ||
- let i = match b { | ||
- false => 0, | ||
- true => 1, | ||
- }; | ||
- self.put_i32(i) | ||
- } | ||
- | ||
- fn decode_bool(&mut self) -> XdrResult<bool> { | ||
- let i = try!(self.get_i32()); | ||
- match i { | ||
- 0 => Ok(false), | ||
- 1 => Ok(true), | ||
- _ => Err(XdrError), | ||
- } | ||
- } | ||
- | ||
- fn encode_i8(&mut self, i: i8) -> XdrResult<()> { | ||
- self.put_i32(i as i32) | ||
- } | ||
- | ||
- fn decode_i8(&mut self) -> XdrResult<i8> { | ||
- self.get_i32().map(|x| x as i8) | ||
- } | ||
- | ||
- fn encode_u8(&mut self, u: u8) -> XdrResult<()> { | ||
- self.put_i32(u as i32) | ||
- } | ||
- | ||
- fn decode_u8(&mut self) -> XdrResult<u8> { | ||
- self.get_i32().map(|x| x as u8) | ||
- } | ||
- | ||
- fn encode_i16(&mut self, i: i16) -> XdrResult<()> { | ||
- self.put_i32(i as i32) | ||
- } | ||
- | ||
- fn decode_i16(&mut self) -> XdrResult<i16> { | ||
- self.get_i32().map(|x| x as i16) | ||
- } | ||
- | ||
- fn encode_u16(&mut self, u: u16) -> XdrResult<()> { | ||
- self.put_i32(u as i32) | ||
- } | ||
- | ||
- fn decode_u16(&mut self) -> XdrResult<u16> { | ||
- self.get_i32().map(|x| x as u16) | ||
- } | ||
- | ||
- fn encode_i32(&mut self, i: i32) -> XdrResult<()> { | ||
- self.put_i32(i) | ||
- } | ||
- | ||
- fn decode_i32(&mut self) -> XdrResult<i32> { | ||
- self.get_i32() | ||
- } | ||
- | ||
- fn encode_u32(&mut self, u: u32) -> XdrResult<()> { | ||
- self.put_i32(u as i32) | ||
- } | ||
- | ||
- fn decode_u32(&mut self) -> XdrResult<u32> { | ||
- self.get_i32().map(|x| x as u32) | ||
- } | ||
- | ||
- fn encode_i64(&mut self, i: i64) -> XdrResult<()> { | ||
- self.put_i64(i) | ||
- } | ||
- | ||
- fn decode_i64(&mut self) -> XdrResult<i64> { | ||
- self.get_i64() | ||
- } | ||
- | ||
- fn encode_u64(&mut self, u: u64) -> XdrResult<()> { | ||
- self.put_i64(u as i64) | ||
- } | ||
- | ||
- fn decode_u64(&mut self) -> XdrResult<u64> { | ||
- self.get_i64().map(|x| x as u64) | ||
- } | ||
- | ||
- fn encode_opaque(&mut self, bytes: &[u8]) -> XdrResult<()> { | ||
- // XDR byte strings always have len%4 == 0 | ||
- let crud: [u8; 4] = [0; 4]; | ||
- let mut round_up = bytes.len() % 4; | ||
- if round_up > 0 { | ||
- round_up = 4 - round_up; | ||
- } | ||
- try!(self.put_bytes(bytes)); | ||
- try!(self.put_bytes(&crud[0..round_up])); | ||
- Ok(()) | ||
- } | ||
- | ||
- fn decode_opaque(&mut self, bytes: &mut [u8]) -> XdrResult<()> { | ||
- // XDR byte strings always have len%4 == 0 | ||
- let mut crud: [u8; 4] = [0; 4]; | ||
- let mut round_up = bytes.len() % 4; | ||
- if round_up > 0 { | ||
- round_up = 4 - round_up; | ||
- } | ||
- try!(self.get_bytes(bytes)); | ||
- try!(self.get_bytes(&mut crud[0..round_up])); | ||
- Ok(()) | ||
- } | ||
- | ||
- fn encode_bytes(&mut self, bytes: &[u8]) -> XdrResult<()> { | ||
- try!(self.encode_u32(bytes.len() as u32)); | ||
- self.encode_opaque(bytes) | ||
- } | ||
- | ||
- fn decode_bytes(&mut self) -> XdrResult<Vec<u8>> { | ||
- let count = try!(self.decode_u32()); | ||
- let mut bytes = vec![0; count as usize]; | ||
- try!(self.decode_opaque(&mut bytes[..])); | ||
- Ok(bytes) | ||
- } | ||
- | ||
- fn encode_string(&mut self, string: &String) -> XdrResult<()> { | ||
- try!(self.encode_u32(string.as_bytes().len() as u32)); | ||
- self.encode_opaque(string.as_bytes()) | ||
- } | ||
- | ||
- fn decode_string(&mut self) -> XdrResult<String> { | ||
- let count = try!(self.decode_u32()); | ||
- if count > 1024 { | ||
- return Err(XdrError); | ||
- } | ||
- let mut bytes = vec![0; count as usize]; | ||
- try!(self.decode_opaque(&mut bytes[..])); | ||
- String::from_utf8(bytes).map_err(|_| XdrError) | ||
- } | ||
-} |
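The len%4 padding in encode_opaque means a byte string costs a 4-byte length word plus the payload rounded up to a multiple of four. A hedged sketch over MemOps from mem_ops.rs (assuming MemOps and the Xdr trait are in scope):

    // Sketch: a 5-byte payload occupies 4 (length) + 8 (5 bytes padded to 8) = 12 bytes.
    let mut buffer = [0u8; 16];
    let mut xdr = MemOps::new(&mut buffer);
    xdr.encode_bytes(b"hello").unwrap();
    assert_eq!(xdr.get_pos(), 12);
    xdr.set_pos(0).unwrap();
    assert_eq!(xdr.decode_bytes().unwrap(), b"hello".to_vec());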
190
crates/zfs/zap.rs
@@ -1,190 +0,0 @@ | ||
-use std::{fmt, mem, ptr, str}; | ||
-use std::io::Seek; | ||
- | ||
-use super::from_bytes::FromBytes; | ||
- | ||
-const MZAP_ENT_LEN: usize = 64; | ||
-const MZAP_NAME_LEN: usize = MZAP_ENT_LEN - 8 - 4 - 2; | ||
- | ||
-#[repr(u64)] | ||
-#[derive(Copy, Clone, Debug)] | ||
-pub enum ZapObjectType { | ||
- Micro = (1 << 63) + 3, | ||
- Header = (1 << 63) + 1, | ||
- Leaf = 1 << 63, | ||
-} | ||
- | ||
-/// Microzap | ||
-#[repr(packed)] | ||
-pub struct MZapPhys { | ||
- pub block_type: ZapObjectType, // ZapObjectType::Micro | ||
- pub salt: u64, | ||
- pub norm_flags: u64, | ||
- pad: [u64; 5], | ||
-} | ||
- | ||
-pub struct MZapWrapper { | ||
- pub phys: MZapPhys, | ||
- pub chunks: Vec<MZapEntPhys>, // variable size depending on block size | ||
-} | ||
- | ||
-impl FromBytes for MZapWrapper { | ||
- fn from_bytes(data: &[u8]) -> Result<Self, String> { | ||
- if data.len() >= mem::size_of::<MZapPhys>() { | ||
- // Read the first part of the mzap -- its base phys struct | ||
- let mzap_phys = unsafe { ptr::read(data.as_ptr() as *const MZapPhys) }; | ||
- // Read the mzap entries, aka chunks | ||
- let mut mzap_entries = Vec::new(); | ||
- let num_entries = (data.len() - mem::size_of::<MZapPhys>()) / | ||
- mem::size_of::<MZapEntPhys>(); | ||
- for i in 0..num_entries { | ||
- let entry_pos = mem::size_of::<MZapPhys>() + i * mem::size_of::<MZapEntPhys>(); | ||
- let mzap_ent = unsafe { | ||
- ptr::read(data[entry_pos..].as_ptr() as *const MZapEntPhys) | ||
- }; | ||
- mzap_entries.push(mzap_ent); | ||
- } | ||
- Ok(MZapWrapper { | ||
- phys: mzap_phys, | ||
- chunks: mzap_entries, | ||
- }) | ||
- } else { | ||
- Err("Error: needs a proper error message".to_string()) | ||
- } | ||
- } | ||
-} | ||
- | ||
-impl fmt::Debug for MZapWrapper { | ||
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
- try!(write!(f, | ||
- "MZapPhys {{\nblock_type: {:?},\nsalt: {:X},\nnorm_flags: {:X},\nchunk: [\n", | ||
- self.phys.block_type, | ||
- self.phys.salt, | ||
- self.phys.norm_flags)); | ||
- for chunk in &self.chunks { | ||
- try!(write!(f, "{:?}\n", chunk)); | ||
- } | ||
- try!(write!(f, "] }}\n")); | ||
- Ok(()) | ||
- } | ||
-} | ||
- | ||
-#[repr(packed)] | ||
-pub struct MZapEntPhys { | ||
- pub value: u64, | ||
- pub cd: u32, | ||
- pub pad: u16, | ||
- pub name: [u8; MZAP_NAME_LEN], | ||
-} | ||
- | ||
-impl MZapEntPhys { | ||
- pub fn name(&self) -> Option<&str> { | ||
- let mut len = 0; | ||
- for c in &self.name[..] { | ||
- if *c == 0 { | ||
- break; | ||
- } | ||
- len += 1; | ||
- } | ||
- | ||
- str::from_utf8(&self.name[..len]).ok() | ||
- } | ||
-} | ||
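The constants at the top of this file encode the microzap layout: each 64-byte entry splits into an 8-byte value, a 4-byte collision differentiator, 2 bytes of padding, and 50 bytes of name. With #[repr(packed)] that arithmetic is exactly the struct size, which a short sketch can check:

    // Sketch: the packed entry layout adds up to the 64-byte MZAP_ENT_LEN.
    assert_eq!(mem::size_of::<MZapEntPhys>(), MZAP_ENT_LEN); // 8 + 4 + 2 + 50
    assert_eq!(MZAP_NAME_LEN, 50);                           // what is left for the name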
- | ||
-impl fmt::Debug for MZapEntPhys { | ||
- fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { | ||
- try!(write!(f, | ||
- "MZapEntPhys {{\nvalue: {:X},\ncd: {:X},\nname: ", | ||
- self.value, | ||
- self.cd)); | ||
- for i in 0..MZAP_NAME_LEN { | ||
- if self.name[i] == 0 { | ||
- break; | ||
- } | ||
- try!(write!(f, "{}", self.name[i] as char)); | ||
- } | ||
- try!(write!(f, "\n}}\n")); | ||
- Ok(()) | ||
- } | ||
-} | ||
- | ||
-/// Fatzap | ||
-#[repr(packed)] | ||
-pub struct ZapPhys { | ||
- pub block_type: ZapObjectType, // ZapObjectType::Header | ||
- pub magic: u64, | ||
- pub ptr_table: ZapTablePhys, | ||
- pub free_block: u64, | ||
- pub num_leafs: u64, | ||
- pub num_entries: u64, | ||
- pub salt: u64, | ||
- pub pad: [u64; 8181], | ||
- pub leafs: [u64; 8192], | ||
-} | ||
- | ||
-#[repr(packed)] | ||
-pub struct ZapTablePhys { | ||
- pub block: u64, | ||
- pub num_blocks: u64, | ||
- pub shift: u64, | ||
- pub next_block: u64, | ||
- pub block_copied: u64, | ||
-} | ||
- | ||
-const ZAP_LEAF_MAGIC: u32 = 0x2AB1EAF; | ||
-const ZAP_LEAF_CHUNKSIZE: usize = 24; | ||
- | ||
-// The amount of space within the chunk available for the array is: | ||
-// chunk size - space for type (1) - space for next pointer (2) | ||
-const ZAP_LEAF_ARRAY_BYTES: usize = ZAP_LEAF_CHUNKSIZE - 3; | ||
- | ||
-// pub struct ZapLeafPhys { | ||
-// pub header: ZapLeafHeader, | ||
-// hash: [u16; ZAP_LEAF_HASH_NUMENTRIES], | ||
-// union zap_leaf_chunk { | ||
-// entry, | ||
-// array, | ||
-// free, | ||
-// } chunks[ZapLeafChunk; ZAP_LEAF_NUMCHUNKS], | ||
-// } | ||
- | ||
-#[repr(packed)] | ||
-pub struct ZapLeafHeader { | ||
- pub block_type: ZapObjectType, // ZapObjectType::Leaf | ||
- pub next: u64, | ||
- pub prefix: u64, | ||
- pub magic: u32, | ||
- pub n_free: u16, | ||
- pub n_entries: u16, | ||
- pub prefix_len: u16, | ||
- pub free_list: u16, | ||
- pad2: [u8; 12], | ||
-} | ||
- | ||
-#[repr(packed)] | ||
-struct ZapLeafEntry { | ||
- leaf_type: u8, | ||
- int_size: u8, | ||
- next: u16, | ||
- name_chunk: u16, | ||
- name_length: u16, | ||
- value_chunk: u16, | ||
- value_length: u16, | ||
- cd: u16, | ||
- pad: [u8; 2], | ||
- hash: u64, | ||
-} | ||
- | ||
-#[repr(packed)] | ||
-struct ZapLeafArray { | ||
- leaf_type: u8, | ||
- array: [u8; ZAP_LEAF_ARRAY_BYTES], | ||
- next: u16, | ||
-} | ||
- | ||
-#[repr(packed)] | ||
-struct ZapLeafFree { | ||
- free_type: u8, | ||
- pad: [u8; ZAP_LEAF_ARRAY_BYTES], | ||
- next: u16, | ||
-} |
38
crates/zfs/zfs.rs
@@ -1,38 +0,0 @@ | ||
-use std::result; | ||
- | ||
-/// The error type used throughout ZFS | ||
-#[derive(Copy, Clone, Debug, PartialEq)] | ||
-pub enum Error { | ||
- NoEntity, | ||
- Invalid, | ||
-} | ||
- | ||
-/// The Result type used throughout ZFS | ||
-pub type Result<T> = result::Result<T, Error>; | ||
- | ||
-/// The following states are written to disk as part of the normal | ||
-/// SPA lifecycle: Active, Exported, Destroyed, Spare, L2Cache. The remaining | ||
-/// states are software abstractions used at various levels to communicate | ||
-/// pool state. | ||
-#[derive(Copy, Clone, PartialEq)] | ||
-pub enum PoolState { | ||
- Active = 0, // In active use | ||
- Exported, // Explicitly exported | ||
- Destroyed, // Explicitly destroyed | ||
- Spare, // Reserved for hot spare use | ||
- L2Cache, // Level 2 ARC device | ||
- Uninitialized, // Internal spa_t state | ||
- Unavailable, // Internal libzfs state | ||
- PotentiallyActive, // Internal libzfs state | ||
-} | ||
- | ||
-/// Internal SPA load state. Used by FMA diagnosis engine. | ||
-#[derive(Copy, Clone, PartialEq)] | ||
-pub enum SpaLoadState { | ||
- None, // no load in progress | ||
- Open, // normal open | ||
- Import, // import in progress | ||
- TryImport, // tryimport in progress | ||
- Recover, // recovery requested | ||
- Error, // load failed | ||
-} |
8
crates/zfs/zil_header.rs
@@ -1,8 +0,0 @@ | ||
-use super::block_ptr::BlockPtr; | ||
- | ||
-#[repr(packed)] | ||
-pub struct ZilHeader { | ||
- claim_txg: u64, | ||
- replay_seq: u64, | ||
- log: BlockPtr, | ||
-} |
950
crates/zfs/zio.rs
@@ -1,950 +0,0 @@ | ||
-use std::{mem, ptr}; | ||
-use std::fs::File; | ||
-use std::io::{Read, Seek, SeekFrom, Write}; | ||
- | ||
-use super::avl; | ||
-use super::block_ptr::BlockPtr; | ||
-use super::dvaddr::DVAddr; | ||
-use super::from_bytes::FromBytes; | ||
-use super::lzjb; | ||
-use super::uberblock::Uberblock; | ||
-use super::zfs; | ||
- | ||
-pub struct Reader { | ||
- pub disk: File, | ||
-} | ||
- | ||
-impl Reader { | ||
- // TODO: Error handling | ||
- pub fn read(&mut self, start: usize, length: usize) -> Vec<u8> { | ||
- let mut ret: Vec<u8> = vec![0; length*512]; | ||
- | ||
- self.disk.seek(SeekFrom::Start(start as u64 * 512)); | ||
- self.disk.read(&mut ret); | ||
- | ||
- return ret; | ||
- } | ||
- | ||
- pub fn write(&mut self, block: usize, data: &[u8; 512]) { | ||
- self.disk.seek(SeekFrom::Start(block as u64 * 512)); | ||
- self.disk.write(data); | ||
- } | ||
- | ||
- pub fn read_dva(&mut self, dva: &DVAddr) -> Vec<u8> { | ||
- self.read(dva.sector() as usize, dva.asize() as usize) | ||
- } | ||
- | ||
- pub fn read_block(&mut self, block_ptr: &BlockPtr) -> Result<Vec<u8>, String> { | ||
- let data = self.read_dva(&block_ptr.dvas[0]); | ||
- match block_ptr.compression() { | ||
- 2 => { | ||
- // compression off | ||
- Ok(data) | ||
- } | ||
- 1 | 3 => { | ||
- // lzjb compression | ||
- let mut decompressed = vec![0; (block_ptr.lsize()*512) as usize]; | ||
- lzjb::decompress(&data, &mut decompressed); | ||
- Ok(decompressed) | ||
- } | ||
- _ => Err("Error: not enough bytes".to_string()), | ||
- } | ||
- } | ||
- | ||
- pub fn read_type<T: FromBytes>(&mut self, block_ptr: &BlockPtr) -> Result<T, String> { | ||
- let data = self.read_block(block_ptr); | ||
- data.and_then(|data| T::from_bytes(&data[..])) | ||
- } | ||
- | ||
- pub fn read_type_array<T: FromBytes>(&mut self, | ||
- block_ptr: &BlockPtr, | ||
- offset: usize) | ||
- -> Result<T, String> { | ||
- let data = self.read_block(block_ptr); | ||
- data.and_then(|data| T::from_bytes(&data[offset * mem::size_of::<T>()..])) | ||
- } | ||
- | ||
- pub fn uber(&mut self) -> Result<Uberblock, String> { | ||
- let mut newest_uberblock: Option<Uberblock> = None; | ||
- for i in 0..128 { | ||
- if let Ok(uberblock) = Uberblock::from_bytes(&self.read(256 + i * 2, 2)) { | ||
- let newest = match newest_uberblock { | ||
- Some(previous) => { | ||
- if uberblock.txg > previous.txg { | ||
- // Found a newer uberblock | ||
- true | ||
- } else { | ||
- false | ||
- } | ||
- } | ||
- // No uberblock yet, so first one we find is the newest | ||
- None => true, | ||
- }; | ||
- | ||
- if newest { | ||
- newest_uberblock = Some(uberblock); | ||
- } | ||
- } | ||
- } | ||
- | ||
- match newest_uberblock { | ||
- Some(uberblock) => Ok(uberblock), | ||
- None => Err("Failed to find valid uberblock".to_string()), | ||
- } | ||
- } | ||
-} | ||
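A minimal usage sketch of the Reader above; the path is a placeholder and error handling stays as thin as the TODO note suggests:

    use std::fs::File;

    // Sketch only: open a disk image, scan the 128 uberblock slots in the first
    // label, and report the newest transaction group found.
    fn print_newest_txg() -> Result<(), String> {
        let disk = match File::open("/path/to/disk.img") {
            Ok(d) => d,
            Err(e) => return Err(e.to_string()),
        };
        let mut reader = Reader { disk: disk };
        let uberblock = try!(reader.uber()); // keeps the uberblock with the highest txg
        println!("newest txg: {}", uberblock.txg);
        Ok(())
    }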
- | ||
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-// pub struct Zio { | ||
-// Core information about this IO | ||
-// bookmark: ZBookmarkPhys, | ||
-// prop: ZioProp, | ||
-// zio_type: Type, | ||
-// child_type: Child, | ||
-// int io_cmd, | ||
-// priority: Priority, | ||
-// reexecute: u8, | ||
-// state: [u8; NUM_WAIT_TYPES], | ||
-// txg: u64, | ||
-// spa_t *io_spa, | ||
-// blkptr_t *io_bp, | ||
-// blkptr_t *io_bp_override, | ||
-// bp_copy: BlockPtr, | ||
-// list_t io_parent_list, | ||
-// list_t io_child_list, | ||
-// zio_link_t *io_walk_link, | ||
-// zio_t *logical, | ||
-// zio_transform_t *io_transform_stack, | ||
-// | ||
-// Callback info | ||
-// ready: DoneFunc, | ||
-// physdone: DoneFunc, | ||
-// done: DoneFunc, | ||
-// private: *void, | ||
-// prev_space_delta: i64, // DMU private | ||
-// bp_orig: BlockPtr, | ||
-// | ||
-// Data represented by this IO | ||
-// void *data, | ||
-// void *orig_data, | ||
-// size: u64, | ||
-// orig_size: u64, | ||
-// | ||
-// Stuff for the vdev stack | ||
-// vdev_t *vd, | ||
-// void *io_vsd, | ||
-// const zio_vsd_ops_t *io_vsd_ops, | ||
-// | ||
-// offset: u64, | ||
-// timestamp: hrtime_t, // submitted at | ||
-// delta: hrtime_t, // vdev queue service delta | ||
-// delay: u64, // vdev disk service delta (ticks) | ||
-// queue_node: avl::NodeId, | ||
-// offset_node: avl::NodeId, | ||
-// | ||
-// Internal pipeline state | ||
-// flags: Flag, | ||
-// stage: State, | ||
-// pipeline: State, | ||
-// orig_flags: ZioFlag, | ||
-// orig_stage: State, | ||
-// orig_pipeline: State, | ||
-// error: zfs::Error, | ||
-// child_error: [zfs::Error; NUM_CHILD_TYPES], | ||
-// children: [[u64; NUM_WAIT_TYPES]; NUM_CHILD_TYPES], | ||
-// child_count: u64, | ||
-// phys_children: u64, | ||
-// parent_count: u64, | ||
-// uint64_t *stall, | ||
-// zio_t *gang_leader, | ||
-// zio_gang_node_t *gang_tree, | ||
-// void *executor, | ||
-// void *waiter, | ||
-// kmutex_t lock, | ||
-// kcondvar_t cv,*/ | ||
-// | ||
-// FMA state | ||
-// zio_cksum_report_t *io_cksum_report, | ||
-// uint64_t io_ena, | ||
-// | ||
-// Taskq dispatching state | ||
-// tqent: TaskqEnt, | ||
-// } | ||
-// | ||
-// impl Zio { | ||
-// pub fn root(spa: Option<&Spa>, zio_done_func_t *done, void *private, flags: Flag) -> Self { | ||
-// Self::null(None, spa, None, done, private, flags) | ||
-// } | ||
-// | ||
-// pub fn read(zio_t *pio, spa_t *spa, const blkptr_t *bp, | ||
-// void *data, uint64_t size, zio_done_func_t *done, void *private, | ||
-// zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) -> Self { | ||
-// zfs_blkptr_verify(spa, bp); | ||
-// | ||
-// let pipeline = | ||
-// if flags & ZIO_FLAG_DDT_CHILD { | ||
-// ZIO_DDT_CHILD_READ_PIPELINE | ||
-// } else { ZIO_READ_PIPELINE }; | ||
-// | ||
-// Self::create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, | ||
-// data, size, done, private, | ||
-// Type::Read, priority, flags, None, 0, zb, | ||
-// State::Open, pipeline) | ||
-// } | ||
-// | ||
-// fn null(pio: Option<&Zio>, spa: Option<&Spa>, vd: Option<&vdev::Vdev>, zio_done_func_t *done, | ||
-// void *private, flags: Flag) -> Self { | ||
-// Self::create(pio, spa, 0, None, None, 0, done, private, | ||
-// Type::Null, Priority::Now, flags, vd, 0, None, | ||
-// State::Open, ZIO_INTERLOCK_PIPELINE) | ||
-// } | ||
-// | ||
-// fn create(zio_t *pio, spa_t *spa, txg: u64, bp: Option<&BlockPtr>, | ||
-// void *data, size: u64, zio_done_func_t *done, void *private, | ||
-// zio_type: Type, priority: Priority, flags: Flag, | ||
-// vd: Option<&vdev::Vdev>, offset: u64, zb: Option<&ZBookmarkPhys>, | ||
-// stage: State, pipeline: State)-> Self { | ||
-// assert!(size <= SPA_MAXBLOCKSIZE); | ||
-// assert!(util::p2_phase(size, SPA_MINBLOCKSIZE) == 0); | ||
-// assert!(util::p2_phase(offset, SPA_MINBLOCKSIZE) == 0); | ||
-// | ||
-// assert!(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); | ||
-// assert!(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); | ||
-// assert!(vd || stage == ZIO_STAGE_OPEN); | ||
-// | ||
-// zio = kmem_cache_alloc(zcache, KM_SLEEP); | ||
-// bzero(zio, sizeof (zt)); | ||
-// | ||
-// mutex_init(&zio->lock, NULL, MUTEX_DEFAULT, NULL); | ||
-// cv_init(&zio->cv, NULL, CV_DEFAULT, NULL); | ||
-// | ||
-// list_create(&zio->parent_list, sizeof (zlink_t), | ||
-// offsetof(zlink_t, zl_parent_node)); | ||
-// list_create(&zio->child_list, sizeof (zlink_t), | ||
-// offsetof(zlink_t, zl_child_node)); | ||
-// | ||
-// let child_type = | ||
-// if vd.is_some() { | ||
-// Child::Vdev | ||
-// } else if flags & ZIO_FLAG_GANG_CHILD { | ||
-// Child::Gang | ||
-// } else if flags & ZIO_FLAG_DDT_CHILD { | ||
-// Child::Ddt | ||
-// } else { | ||
-// Child::Logical | ||
-// }; | ||
-// | ||
-// if let Some(bp) = bp { | ||
-// zio.bp = (blkptr_t *)bp; | ||
-// zio.bp_copy = *bp; | ||
-// zio.bp_orig = *bp; | ||
-// if zio_type != Type::Write || child_type == Child::Ddt { | ||
-// zio.bp = &zio.bp_copy; // so caller can free | ||
-// } | ||
-// if child_type == Child::Logical { | ||
-// zio.logical = zio; | ||
-// } | ||
-// if child_type > Child::Gang && BP_IS_GANG(bp) { | ||
-// pipeline |= ZIO_GANG_STAGES; | ||
-// } | ||
-// } | ||
-// | ||
-// if zb != NULL { | ||
-// zio.bookmark = *zb; | ||
-// } | ||
-// | ||
-// if let Some(pio) = pio { | ||
-// if zio.logical == NULL { | ||
-// zio.logical = pio.logical; | ||
-// } | ||
-// if zio.child_type == Child::Gang { | ||
-// zio.gang_leader = pio.gang_leader; | ||
-// } | ||
-// Self::add_child(pio, zio); | ||
-// } | ||
-// | ||
-// taskq::taskq_init_ent(&zio->tqent); | ||
-// | ||
-// Zio { | ||
-// child_type: child_type, | ||
-// spa: spa, | ||
-// txg: txg, | ||
-// done: done, | ||
-// private: private, | ||
-// zio_type: zio_type, | ||
-// priority: priority, | ||
-// vd: vd, | ||
-// offset: offset, | ||
-// | ||
-// data: data, | ||
-// orig_data: data, | ||
-// size: size, | ||
-// orig_size: size, | ||
-// | ||
-// flags: flags, | ||
-// orig_flags: flags, | ||
-// stage: stage, | ||
-// orig_stage: stage, | ||
-// pipeline: pipeline, | ||
-// orig_pipeline: pipeline, | ||
-// | ||
-// state: [stage >= State::Ready, | ||
-// stage >= State::Done], | ||
-// } | ||
-// } | ||
-// | ||
-// fn read_phys(zio_t *pio, vdev_t *vd, offset: u64, size: u64, | ||
-// void *data, int checksum, zio_done_func_t *done, void *private, | ||
-// priority: Priority, zio_flag flags, labels: bool) -> Zio { | ||
-// assert!(vd->vdev_children == 0); | ||
-// assert!(!labels || offset + size <= VDEV_LABEL_START_SIZE || | ||
-// offset >= vd.vdev_psize - VDEV_LABEL_END_SIZE); | ||
-// assert!(offset + size <= vd.vdev_psize); | ||
-// | ||
-// let mut zio = Self::create(pio, vd.vdev_spa, 0, NULL, data, size, done, private, | ||
-// Type::Read, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset, | ||
-// NULL, State::Open, ZIO_READ_PHYS_PIPELINE); | ||
-// | ||
-// zio.prop.checksum = checksum; | ||
-// | ||
-// zio | ||
-// } | ||
-// | ||
-// ========================================================================== | ||
-// Parent/Child relationships | ||
-// ========================================================================== | ||
-// | ||
-// fn add_child(parent: &mut Zio, child: &mut Zio) { | ||
-// zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); | ||
-// int w; | ||
-// | ||
-// Logical I/Os can have logical, gang, or vdev children. | ||
-// Gang I/Os can have gang or vdev children. | ||
-// Vdev I/Os can only have vdev children. | ||
-// The following assert captures all of these constraints. | ||
-// assert!(cio->io_child_type <= pio->io_child_type); | ||
-// | ||
-// zl.parent = parent; | ||
-// zl.child = child; | ||
-// | ||
-// mutex_enter(&child.lock); | ||
-// mutex_enter(&parent.lock); | ||
-// | ||
-// assert!(parent.state[WaitType::Done] == 0); | ||
-// | ||
-// for w in 0..NUM_WAIT_TYPES { | ||
-// parent.children[child.child_type][w] += !child.state[w]; | ||
-// } | ||
-// | ||
-// list_insert_head(&pio->io_child_list, zl); | ||
-// list_insert_head(&cio->io_parent_list, zl); | ||
-// | ||
-// parent.child_count += 1; | ||
-// child.parent_count += 1; | ||
-// | ||
-// mutex_exit(&pio->io_lock); | ||
-// mutex_exit(&cio->io_lock); | ||
-// } | ||
-// | ||
-// ========================================================================== | ||
-// Execute the IO pipeline | ||
-// ========================================================================== | ||
-// | ||
-// fn taskq_dispatch(&mut self, mut tq_type: TaskqType, cut_in_line: bool) { | ||
-// let spa = self.spa; | ||
-// let flags = if cut_in_line { TQ_FRONT } else { 0 }; | ||
-// | ||
-// let zio_type = | ||
-// if self.flags & (FLAG_CONFIG_WRITER | FLAG_PROBE) != 0 { | ||
-// If we're a config writer or a probe, the normal issue and | ||
-// interrupt threads may all be blocked waiting for the config lock. | ||
-// In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. | ||
-// Type::Null | ||
-// } else if self.zio_type == Type::Write && self.vd.is_some() && self.vd.vdev_aux { | ||
-// A similar issue exists for the L2ARC write thread until L2ARC 2.0. | ||
-// Type::Null | ||
-// } else { | ||
-// self.zio_type | ||
-// }; | ||
-// | ||
-// If this is a high priority IO, then use the high priority taskq if | ||
-// available. | ||
-// if self.priority == Priority::Now && spa->spa_zio_taskq[t][tq_type + 1].stqs_count != 0 { | ||
-// tq_type += 1; | ||
-// } | ||
-// | ||
-// assert!(tq_type < NUM_TASKQ_TYPES); | ||
-// | ||
-// NB: We are assuming that the zio can only be dispatched | ||
-// to a single taskq at a time. It would be a grievous error | ||
-// to dispatch the zio to another taskq at the same time. | ||
-// assert!(taskq_empty_ent(&zio.tqent)); | ||
-// spa.taskq_dispatch_ent(zio_type, tq_type, Box::new(|| { self.execute() }), flags, &self.tqent); | ||
-// } | ||
-// | ||
-// fn taskq_member(&self, TaskqType q) -> bool { | ||
-// let spa = self.spa; | ||
-// | ||
-// for t in 0..NUM_ZIO_TYPES { | ||
-// let tqs = &spa.zio_taskq[t][q]; | ||
-// for i in 0..tqs.count { | ||
-// if tqs.taskq[i].member(self.executor) { | ||
-// return true; | ||
-// } | ||
-// } | ||
-// } | ||
-// | ||
-// false | ||
-// } | ||
-// | ||
-// fn issue_async(&self) -> PipelineFlow { | ||
-// self.taskq_dispatch(TaskqType::Issue, false); | ||
-// | ||
-// PipelineFlow::Stop | ||
-// } | ||
-// | ||
-// fn interrupt(&self) { | ||
-// self.taskq_dispatch(TaskqType::Interrupt, false); | ||
-// } | ||
-// | ||
-// Execute the I/O pipeline until one of the following occurs: | ||
-// (1) the I/O completes; (2) the pipeline stalls waiting for | ||
-// dependent child I/Os; (3) the I/O issues, so we're waiting | ||
-// for an I/O completion interrupt; (4) the I/O is delegated by | ||
-// vdev-level caching or aggregation; (5) the I/O is deferred | ||
-// due to vdev-level queueing; (6) the I/O is handed off to | ||
-// another thread. In all cases, the pipeline stops whenever | ||
-// there's no CPU work; it never burns a thread in cv_wait_io(). | ||
-// | ||
-// There's no locking on io_stage because there's no legitimate way | ||
-// for multiple threads to be attempting to process the same I/O. | ||
-// fn execute(&mut self) { | ||
-// self.executor = curthread; | ||
-// | ||
-// while self.stage < State::Done { | ||
-// let mut stage = self.stage; | ||
-// | ||
-// assert!(!MUTEX_HELD(&self.io_lock)); | ||
-// assert!(ISP2(stage)); | ||
-// assert!(self.stall == NULL); | ||
-// while stage & self.pipeline == 0 { | ||
-// stage <<= 1; | ||
-// } | ||
-// | ||
-// assert!(stage <= State::Done); | ||
-// | ||
-// let cut = | ||
-// match stage { | ||
-// State::VdevIoStart => REQUEUE_IO_START_CUT_IN_LINE, | ||
-// _ => false, | ||
-// }; | ||
-// | ||
-// If we are in interrupt context and this pipeline stage | ||
-// will grab a config lock that is held across IO, | ||
-// or may wait for an IO that needs an interrupt thread | ||
-// to complete, issue async to avoid deadlock. | ||
-// | ||
-// For VDEV_IO_START, we cut in line so that the io will | ||
-// be sent to disk promptly. | ||
-// if stage & BLOCKING_STAGES != 0 && self.vd.is_none() && self.taskq_member(TaskqType::Interrupt) { | ||
-// self.taskq_dispatch(TaskqType::Issue, cut); | ||
-// return; | ||
-// } | ||
-// | ||
-// If we are executing in the context of the tx_sync_thread, | ||
-// or we are performing pool initialization outside of a | ||
-// zio_taskq[ZIO_TASKQ_ISSUE|ZIO_TASKQ_ISSUE_HIGH] context. | ||
-// Then issue the zio asynchronously to minimize stack usage | ||
-// for these deep call paths. | ||
-// let dp = self.spa.get_dsl_pool(); | ||
-// if (dp && curthread == dp.tx.tx_sync_thread) || | ||
-// (dp && dp.spa.is_initializing() && !self.taskq_member(TaskqType::Issue) && | ||
-// !self.taskq_member(TaskqType::IssueHigh)) { | ||
-// self.taskq_dispatch(TaskqType::Issue, cut); | ||
-// return; | ||
-// }*/ | ||
-// | ||
-// self.stage = stage; | ||
-// let rv = pipeline_stages[highbit64(stage) - 1](self); | ||
-// | ||
-// if rv == PipelineFlow::Stop { | ||
-// return; | ||
-// } | ||
-// | ||
-// assert!(rv == PipelineFlow::Continue); | ||
-// } | ||
-// } | ||
-// | ||
-// pub fn wait(&self) -> zfs::Result<()> { | ||
-// assert!(self.stage == State::Open); | ||
-// assert!(self.executor == NULL); | ||
-// | ||
-// self.waiter = curthread; | ||
-// | ||
-// self.execute(); | ||
-// | ||
-// mutex_enter(&self.lock); | ||
-// while self.executor != NULL { | ||
-// cv_wait_io(&self.cv, &self.lock); | ||
-// } | ||
-// mutex_exit(&self.lock); | ||
-// | ||
-// let error = self.error; | ||
-// self.destroy(); | ||
-// | ||
-// Ok(()) | ||
-// } | ||
-// | ||
-// fn no_wait(&mut self) { | ||
-// assert!(self.executor == NULL); | ||
-// | ||
-// if self.child_type == Child::Logical && self.unique_parent() == NULL { | ||
-// This is a logical async I/O with no parent to wait for it. | ||
-// We add it to the spa_async_root_zio "Godfather" I/O which | ||
-// will ensure they complete prior to unloading the pool. | ||
-// kpreempt_disable(); | ||
-// let pio = self.spa.async_zio_root[CPU_SEQID]; | ||
-// kpreempt_enable(); | ||
-// | ||
-// Self::add_child(pio, self); | ||
-// } | ||
-// | ||
-// self.execute(); | ||
-// } | ||
-// | ||
-// ///////////////////////////////////////////////////////////////////////////////////////////// | ||
-// Pipeline stages | ||
-// ///////////////////////////////////////////////////////////////////////////////////////////// | ||
-// | ||
-// fn read_bp_init(zio_t *zio) -> PipelineFlow { | ||
-// blkptr_t *bp = zio.bp; | ||
-// | ||
-// if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && | ||
-// zio.child_type == Child::Logical && | ||
-// !(zio->io_flags & ZIO_FLAG_RAW)) { | ||
-// uint64_t psize = BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); | ||
-// void *cbuf = zio_buf_alloc(psize); | ||
-// | ||
-// zio_push_transform(zio, cbuf, psize, psize, zio_decompress); | ||
-// } | ||
-// | ||
-// if BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA { | ||
-// zio.pipeline = ZIO_INTERLOCK_PIPELINE; | ||
-// decode_embedded_bp_compressed(bp, zio->io_data); | ||
-// } else { | ||
-// ASSERT(!BP_IS_EMBEDDED(bp)); | ||
-// } | ||
-// | ||
-// if !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0 { | ||
-// zio.flags |= ZIO_FLAG_DONT_CACHE; | ||
-// } | ||
-// | ||
-// if BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP { | ||
-// zio.flags |= ZIO_FLAG_DONT_CACHE; | ||
-// } | ||
-// | ||
-// if BP_GET_DEDUP(bp) && zio.child_type == Child::Logical { | ||
-// zio.pipeline = ZIO_DDT_READ_PIPELINE; | ||
-// } | ||
-// | ||
-// return PipelineFlow::Continue; | ||
-// } | ||
-// | ||
-// Issue an I/O to the underlying vdev. Typically the issue pipeline | ||
-// stops after this stage and will resume upon I/O completion. | ||
-// However, there are instances where the vdev layer may need to | ||
-// continue the pipeline when an I/O was not issued. Since the I/O | ||
-// that was sent to the vdev layer might be different than the one | ||
-// currently active in the pipeline (see vdev_queue_io()), we explicitly | ||
-// force the underlying vdev layers to call either zio_execute() or | ||
-// zio_interrupt() to ensure that the pipeline continues with the correct I/O. | ||
-// fn vdev_io_start(zio_t *zio) -> PipelineFlow { | ||
-// vdev_t *vd = zio.vd; | ||
-// spa_t *spa = zio.spa; | ||
-// | ||
-// assert!(zio.error == 0); | ||
-// assert!(zio.child_error[Child::Vdev] == 0); | ||
-// | ||
-// if vd == NULL { | ||
-// if zio.flags & ZIO_FLAG_CONFIG_WRITER == 0 { | ||
-// spa_config_enter(spa, SCL_ZIO, zio, RW_READER); | ||
-// } | ||
-// | ||
-// The mirror_ops handle multiple DVAs in a single BP. | ||
-// vdev_mirror_ops.vdev_op_start(zio); | ||
-// return PipelineFlow::Stop; | ||
-// } | ||
-// | ||
-// We keep track of time-sensitive I/Os so that the scan thread | ||
-// can quickly react to certain workloads. In particular, we care | ||
-// about non-scrubbing, top-level reads and writes with the following | ||
-// characteristics: | ||
-// - synchronous writes of user data to non-slog devices | ||
-// - any reads of user data | ||
-// When these conditions are met, adjust the timestamp of spa_last_io | ||
-// which allows the scan thread to adjust its workload accordingly. | ||
-// if zio.flags & ZIO_FLAG_SCAN_THREAD == 0 && zio.bp != NULL && vd == vd.top_vdev && | ||
-// !vd.is_log && zio.bookmark.objset != DMU_META_OBJSET && zio.txg != spa.syncing_txg() { | ||
-// let old = spa.spa_last_io; | ||
-// let new = ddi_get_lbolt64(); | ||
-// if old != new { | ||
-// atomic_cas_64(&spa.spa_last_io, old, new); | ||
-// } | ||
-// } | ||
-// | ||
-// let align = 1 << vd.top_vdev.ashift; | ||
-// | ||
-// if zio.flags & ZIO_FLAG_PHYSICAL == 0 && util::p2_phase(zio.size, align) != 0 { | ||
-// Transform logical writes to be a full physical block size. | ||
-// let asize = util::p2_round_up(zio.size, align); | ||
-// char *abuf = zio_buf_alloc(asize); | ||
-// assert!(vd == vd.vdev_top); | ||
-// if (zio.zio_type == Type::Write) { | ||
-// bcopy(zio.data, abuf, zio.size); | ||
-// bzero(abuf + zio.size, asize - zio.size); | ||
-// } | ||
-// zio_push_transform(zio, abuf, asize, asize, zsubblock); | ||
-// } | ||
-// | ||
-// If this is not a physical io, make sure that it is properly aligned | ||
-// before proceeding. | ||
-// if zio.flags & ZIO_FLAG_PHYSICAL == 0 { | ||
-// assert!(util::p2_phase(zio.offset, align) == 0); | ||
-// assert!(util::p2_phase(zio.size, align) == 0); | ||
-// } else { | ||
-// For physical writes, we allow 512b aligned writes and assume | ||
-// the device will perform a read-modify-write as necessary. | ||
-// assert!(util::p2_phase(zio.offset, SPA_MINBLOCKSIZE) == 0); | ||
-// assert!(util::p2_phase(zio.size, SPA_MINBLOCKSIZE) == 0); | ||
-// } | ||
-// | ||
-// VERIFY(zio.zio_type != Type::Write || spa_writeable(spa)); | ||
-// | ||
-// If this is a repair I/O, and there's no self-healing involved -- | ||
-// that is, we're just resilvering what we expect to resilver -- | ||
-// then don't do the I/O unless zio's txg is actually in vd's DTL. | ||
-// This prevents spurious resilvering with nested replication. | ||
-// For example, given a mirror of mirrors, (A+B)+(C+D), if only | ||
-// A is out of date, we'll read from C+D, then use the data to | ||
-// resilver A+B -- but we don't actually want to resilver B, just A. | ||
-// The top-level mirror has no way to know this, so instead we just | ||
-// discard unnecessary repairs as we work our way down the vdev tree. | ||
-// The same logic applies to any form of nested replication: | ||
-// ditto + mirror, RAID-Z + replacing, etc. This covers them all. | ||
-// if (zio.flags & ZIO_FLAG_IO_REPAIR != 0 && | ||
-// zio.flags & ZIO_FLAG_SELF_HEAL == 0 && | ||
-// zio.txg != 0 && /* not a delegated i/o */ | ||
-// !vdev_dtl_contains(vd, DTL_PARTIAL, zio.txg, 1)) { | ||
-// assert!(zio.zio_type == Type::Write); | ||
-// zio_vdev_bypass(zio); | ||
-// return PipelineFlow::Continue; | ||
-// } | ||
-// | ||
-// if vd.ops.is_leaf() && (zio.zio_type == Type::Read || zio.zio_type == Type::Write) { | ||
-// if zio.zio_type == Type::Read && vdev_cache_read(zio) { | ||
-// return PipelineFlow::Continue; | ||
-// } | ||
-// | ||
-// if (zio = vdev_queue_io(zio)) == NULL { | ||
-// return PipelineFlow::Stop; | ||
-// } | ||
-// | ||
-// if !vdev_accessible(vd, zio) { | ||
-// zio.error = SET_ERROR(ENXIO); | ||
-// zio.interrupt(); | ||
-// return PipelineFlow::Stop; | ||
-// } | ||
-// } | ||
-// | ||
-// (vd.ops.io_start)(zio); | ||
-// PipelineFlow::Stop | ||
-// } | ||
-// | ||
-// fn vdev_io_done(zio: &mut Zio) -> PipelineFlow { | ||
-// vdev_t *vd = zio.vd; | ||
-// vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; | ||
-// let mut unexpected_error = false; | ||
-// | ||
-// if zio.wait_for_children(Child::Vdev, WaitType::Done) { | ||
-// return PipelineFlow::Stop; | ||
-// } | ||
-// | ||
-// assert!(zio.zio_type == Type::Read || zio.zio_type == Type::Write); | ||
-// | ||
-// if vd != NULL && vd.ops.is_leaf() { | ||
-// vdev_queue_io_done(zio); | ||
-// | ||
-// if zio.zio_type == Type::Write { | ||
-// vdev_cache_write(zio); | ||
-// } | ||
-// | ||
-// if zio_injection_enabled && zio.error == 0 { | ||
-// zio.error = zio_handle_device_injection(vd, zio, EIO); | ||
-// } | ||
-// | ||
-// if zio_injection_enabled && zio.error == 0 { | ||
-// zio.error = zio_handle_label_injection(zio, EIO); | ||
-// }*/ | ||
-// | ||
-// if zio.error { | ||
-// if !vdev_accessible(vd, zio) { | ||
-// zio.error = SET_ERROR(ENXIO); | ||
-// } else { | ||
-// unexpected_error = true; | ||
-// } | ||
-// } | ||
-// } | ||
-// | ||
-// (ops.io_done)(zio); | ||
-// | ||
-// if unexpected_error { | ||
-// VERIFY(vdev_probe(vd, zio) == NULL); | ||
-// } | ||
-// | ||
-// PipelineFlow::Continue | ||
-// } | ||
-// } | ||
- | ||
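The commented-out add_child above captures how zio dependency tracking works: a parent keeps a per-child-type, per-wait-type count of children that have not yet reached the ready or done state, and a stage that must wait on children simply stalls until the relevant counter drains to zero. Below is a simplified, self-contained sketch of that bookkeeping; the type and method names, and the unconditional increments, are assumptions of mine rather than the crate's API.

```rust
// Hypothetical, simplified dependency bookkeeping for a zio-like pipeline.
// Indices mirror the constants below: NUM_CHILD_TYPES child classes
// (vdev/gang/ddt/logical) and NUM_WAIT_TYPES wait points (ready/done).
const NUM_CHILD_TYPES: usize = 4;
const NUM_WAIT_TYPES: usize = 2;

#[derive(Clone, Copy)]
enum WaitType { Ready = 0, Done = 1 }

#[derive(Default)]
struct ZioDeps {
    // children[child_type][wait_type]: children not yet at that wait point.
    children: [[u64; NUM_WAIT_TYPES]; NUM_CHILD_TYPES],
}

impl ZioDeps {
    // Called when a child of the given type is attached to this parent.
    fn add_child(&mut self, child_type: usize) {
        for w in 0..NUM_WAIT_TYPES {
            self.children[child_type][w] += 1;
        }
    }

    // Called when a child reaches "ready" or "done".
    fn notify(&mut self, child_type: usize, wait: WaitType) {
        self.children[child_type][wait as usize] -= 1;
    }

    // A stage that depends on children of `child_type` reaching `wait`
    // stalls (returns Stop in the pipeline) while this is true.
    fn must_wait(&self, child_type: usize, wait: WaitType) -> bool {
        self.children[child_type][wait as usize] != 0
    }
}

fn main() {
    let mut parent = ZioDeps::default();
    parent.add_child(0 /* vdev child */);
    assert!(parent.must_wait(0, WaitType::Done));
    parent.notify(0, WaitType::Ready);
    parent.notify(0, WaitType::Done);
    assert!(!parent.must_wait(0, WaitType::Done));
    println!("dependency bookkeeping ok");
}
```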
-/// ///////////////////////////////////////////////////////////////////////////////////////////////// | ||
- | ||
-// A bookmark is a four-tuple <objset, object, level, blkid> that uniquely | ||
-// identifies any block in the pool. By convention, the meta-objset (MOS) | ||
-// is objset 0, and the meta-dnode is object 0. This covers all blocks | ||
-// except root blocks and ZIL blocks, which are defined as follows: | ||
-// | ||
-// Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. | ||
-// ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. | ||
-// dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>. | ||
-// | ||
-// Note: this structure is called a bookmark because its original purpose | ||
-// was to remember where to resume a pool-wide traverse. | ||
-// | ||
-// Note: this structure is passed between userland and the kernel, and is | ||
-// stored on disk (by virtue of being incorporated into other on-disk | ||
-// structures, e.g. dsl_scan_phys_t). | ||
-// | ||
-struct ZbookmarkPhys { | ||
- objset: u64, | ||
- object: u64, | ||
- level: i64, | ||
- blkid: u64, | ||
-} | ||
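To make the bookmark convention documented above concrete, here is a self-contained sketch with hypothetical constructor helpers; the deleted code defines only the bare struct, so these helpers and the example values are illustrative.

```rust
// Self-contained sketch of the <objset, object, level, blkid> convention.
#[derive(Debug, PartialEq)]
struct ZbookmarkPhys {
    objset: u64,
    object: u64,
    level: i64,
    blkid: u64,
}

impl ZbookmarkPhys {
    /// Root block (objset_phys_t) of objset `os`: <objset, 0, -1, 0>.
    fn root_block(os: u64) -> Self {
        ZbookmarkPhys { objset: os, object: 0, level: -1, blkid: 0 }
    }

    /// ZIL block of objset `os`, keyed by ZIL sequence number: <objset, 0, -2, seq>.
    fn zil_block(os: u64, seq: u64) -> Self {
        ZbookmarkPhys { objset: os, object: 0, level: -2, blkid: seq }
    }

    /// Any ordinary data or indirect block: <objset, object, level, blkid>.
    fn block(os: u64, object: u64, level: i64, blkid: u64) -> Self {
        ZbookmarkPhys { objset: os, object, level, blkid }
    }
}

fn main() {
    // The meta-objset (MOS) is objset 0 by convention, so its root block is <0, 0, -1, 0>.
    assert_eq!(ZbookmarkPhys::root_block(0),
               ZbookmarkPhys { objset: 0, object: 0, level: -1, blkid: 0 });
    let zil = ZbookmarkPhys::zil_block(54, 7);
    assert_eq!((zil.level, zil.blkid), (-2, 7));
    println!("bookmark examples ok");
}
```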
- | ||
-const REQUEUE_IO_START_CUT_IN_LINE: bool = true; | ||
-pub const NUM_CHILD_TYPES: usize = 4; | ||
-pub const NUM_WAIT_TYPES: usize = 2; | ||
-pub const NUM_TYPES: usize = 6; | ||
-pub const NUM_TASKQ_TYPES: usize = 4; | ||
- | ||
-// Default Linux timeout for a sd device. | ||
-// const ZIO_DELAY_MAX = (30 * MILLISEC); | ||
- | ||
-// const ZIO_FAILURE_MODE_WAIT = 0; | ||
-// const ZIO_FAILURE_MODE_CONTINUE = 1; | ||
-// const ZIO_FAILURE_MODE_PANIC = 2; | ||
- | ||
-// pub enum TaskqType { | ||
-// Issue = 0, | ||
-// IssueHigh, | ||
-// Interrupt, | ||
-// InterruptHigh, | ||
-// } | ||
-// | ||
-// #[derive(Copy, Clone, PartialEq)] | ||
-// enum Priority { | ||
-// SyncRead, | ||
-// SyncWrite, // ZIL | ||
-// AsyncRead, // prefetch | ||
-// AsyncWrite, // spa_sync() | ||
-// Scrub, // asynchronous scrub/resilver reads | ||
-// NumQueueable, | ||
-// | ||
-// Now // non-queued io (e.g. free) | ||
-// } | ||
-// | ||
-// #[derive(Copy, Clone, PartialEq)] | ||
-// pub enum Type { | ||
-// Null = 0, | ||
-// Read, | ||
-// Write, | ||
-// Free, | ||
-// Claim, | ||
-// IoCtl, | ||
-// } | ||
-// | ||
-// const FLAG_AGG_INHERIT: u64 = Flag::CanFail - 1; | ||
-// const FLAG_DDT_INHERIT: u64 = Flag::IoRetry - 1; | ||
-// const FLAG_GANG_INHERIT: u64 = Flag::IoRetry - 1; | ||
-// const FLAG_VDEV_INHERIT: u64 = Flag::DontQueue - 1; | ||
-// | ||
-// const NUM_PIPE_STAGES: usize = 22; | ||
-// | ||
-// type PipeStageFn = fn(&mut Zio) -> zfs::Result<()>; | ||
-// static pipeline_stages: [Option<PipeStageFn>; NUM_PIPE_STAGES] = | ||
-// [None, | ||
-// Some(Zio::read_bp_init), | ||
-// None,//Some(Zio::free_bp_init), | ||
-// Some(Zio::issue_async), | ||
-// None,//Some(Zio::write_bp_init), | ||
-// None,//Some(Zio::checksum_generate), | ||
-// None,//Some(Zio::nop_write), | ||
-// None,//Some(Zio::ddt_read_start), | ||
-// None,//Some(Zio::ddt_read_done), | ||
-// None,//Some(Zio::ddt_write), | ||
-// None,//Some(Zio::ddt_free), | ||
-// None,//Some(Zio::gang_assemble), | ||
-// None,//Some(Zio::gang_issue), | ||
-// None,//Some(Zio::dva_allocate), | ||
-// None,//Some(Zio::dva_free), | ||
-// None,//Some(Zio::dva_claim), | ||
-// Some(Zio::ready), | ||
-// Some(Zio::vdev_io_start), | ||
-// Some(Zio::vdev_io_done), | ||
-// Some(Zio::vdev_io_assess), | ||
-// Some(Zio::checksum_verify), | ||
-// Some(Zio::done)]; | ||
-// | ||
-// #[derive(Copy, Clone, PartialEq)] | ||
-// enum PipelineFlow { | ||
-// Continue = 0x100, | ||
-// Stop = 0x101, | ||
-// } | ||
-// | ||
-// #[derive(Copy, Clone, PartialEq)] | ||
-// enum Flag { | ||
-// Flags inherited by gang, ddt, and vdev children, | ||
-// and that must be equal for two zios to aggregate | ||
-// DontAggregate = 1 << 0, | ||
-// IoRepair = 1 << 1, | ||
-// SelfHeal = 1 << 2, | ||
-// Resilver = 1 << 3, | ||
-// Scrub = 1 << 4, | ||
-// ScanThread = 1 << 5, | ||
-// Physical = 1 << 6, | ||
-// | ||
-// Flags inherited by ddt, gang, and vdev children. | ||
-// CanFail = 1 << 7, // must be first for INHERIT | ||
-// Speculative = 1 << 8, | ||
-// ConfigWriter = 1 << 9, | ||
-// DontRetry = 1 << 10, | ||
-// DontCache = 1 << 11, | ||
-// NoData = 1 << 12, | ||
-// InduceDamage = 1 << 13, | ||
-// | ||
-// Flags inherited by vdev children. | ||
-// IoRetry = 1 << 14, /* must be first for INHERIT */ | ||
-// Probe = 1 << 15, | ||
-// TryHard = 1 << 16, | ||
-// Optional = 1 << 17, | ||
-// | ||
-// Flags not inherited by any children. | ||
-// DontQueue = 1 << 18, /* must be first for INHERIT */ | ||
-// DontPropagate = 1 << 19, | ||
-// IoBypass = 1 << 20, | ||
-// IoRewrite = 1 << 21, | ||
-// Raw = 1 << 22, | ||
-// GangChild = 1 << 23, | ||
-// DdtChild = 1 << 24, | ||
-// GodFather = 1 << 25, | ||
-// NopWrite = 1 << 26, | ||
-// ReExecuted = 1 << 27, | ||
-// Delegated = 1 << 28, | ||
-// FastWrite = 1 << 29, | ||
-// }; | ||
-// | ||
-// #[derive(Copy, Clone, PartialEq)] | ||
-// enum Child { | ||
-// Vdev = 0, | ||
-// Gang, | ||
-// Ddt, | ||
-// Logical, | ||
-// }; | ||
-// | ||
-// #[repr(u8)] | ||
-// enum WaitType { | ||
-// Ready = 0, | ||
-// Done, | ||
-// }; | ||
-// | ||
-// zio pipeline stage definitions | ||
-// enum Stage { | ||
-// Open = 1 << 0, // RWFCI | ||
-// | ||
-// ReadBpInit = 1 << 1, // R---- | ||
-// FreeBpInit = 1 << 2, // --F-- | ||
-// IssueAsync = 1 << 3, // RWF-- | ||
-// WriteBpInit = 1 << 4, // -W--- | ||
-// | ||
-// ChecksumGenerate = 1 << 5, // -W--- | ||
-// | ||
-// NopWrite = 1 << 6, // -W--- | ||
-// | ||
-// DdtReadStart = 1 << 7, // R---- | ||
-// DdtReadDone = 1 << 8, // R---- | ||
-// DdtWrite = 1 << 9, // -W--- | ||
-// DdtFree = 1 << 10, // --F-- | ||
-// | ||
-// GangAssemble = 1 << 11, // RWFC- | ||
-// GangIssue = 1 << 12, // RWFC- | ||
-// | ||
-// DvaAllocate = 1 << 13, // -W--- | ||
-// DvaFree = 1 << 14, // --F-- | ||
-// DvaClaim = 1 << 15, // ---C- | ||
-// | ||
-// Ready = 1 << 16, // RWFCI | ||
-// | ||
-// VdevIoStart = 1 << 17, // RW--I | ||
-// VdevIoDone = 1 << 18, // RW--I | ||
-// VdevIoAssess = 1 << 19, // RW--I | ||
-// | ||
-// ChecksumVerify = 1 << 20, // R---- | ||
-// | ||
-// Done = 1 << 21, // RWFCI | ||
-// }; | ||
-// | ||
-// const INTERLOCK_STAGES = STAGE_READY | STAGE_DONE; | ||
-// | ||
-// const INTERLOCK_PIPELINE = INTERLOCK_STAGES | ||
-// | ||
-// const VDEV_IO_STAGES = STAGE_VDEV_IO_START | | ||
-// STAGE_VDEV_IO_DONE | STAGE_VDEV_IO_ASSESS; | ||
-// | ||
-// const VDEV_CHILD_PIPELINE = VDEV_IO_STAGES | STAGE_DONE; | ||
-// | ||
-// const READ_COMMON_STAGES = INTERLOCK_STAGES | VDEV_IO_STAGES | STAGE_CHECKSUM_VERIFY | ||
-// | ||
-// const READ_PHYS_PIPELINE = READ_COMMON_STAGES | ||
-// | ||
-// const READ_PIPELINE = READ_COMMON_STAGES | STAGE_READ_BP_INIT | ||
-// | ||
-// const DDT_CHILD_READ_PIPELINE = READ_COMMON_STAGES; | ||
-// | ||
-// const DDT_READ_PIPELINE = INTERLOCK_STAGES | STAGE_READ_BP_INIT | STAGE_DDT_READ_START | STAGE_DDT_READ_DONE; | ||
-// | ||
-// const WRITE_COMMON_STAGES = INTERLOCK_STAGES | VDEV_IO_STAGES | STAGE_ISSUE_ASYNC | STAGE_CHECKSUM_GENERATE; | ||
-// | ||
-// const WRITE_PHYS_PIPELINE = WRITE_COMMON_STAGES; | ||
-// | ||
-// const REWRITE_PIPELINE = WRITE_COMMON_STAGES | STAGE_WRITE_BP_INIT; | ||
-// | ||
-// const WRITE_PIPELINE = WRITE_COMMON_STAGES | STAGE_WRITE_BP_INIT | STAGE_DVA_ALLOCATE; | ||
-// | ||
-// const DDT_CHILD_WRITE_PIPELINE = INTERLOCK_STAGES | VDEV_IO_STAGES | STAGE_DVA_ALLOCATE; | ||
-// | ||
-// const DDT_WRITE_PIPELINE = INTERLOCK_STAGES | STAGE_ISSUE_ASYNC | | ||
-// STAGE_WRITE_BP_INIT | STAGE_CHECKSUM_GENERATE | | ||
-// STAGE_DDT_WRITE; | ||
-// | ||
-// const GANG_STAGES = STAGE_GANG_ASSEMBLE | STAGE_GANG_ISSUE; | ||
-// | ||
-// const FREE_PIPELINE = INTERLOCK_STAGES | STAGE_FREE_BP_INIT | STAGE_DVA_FREE; | ||
-// | ||
-// const DDT_FREE_PIPELINE = INTERLOCK_STAGES | STAGE_FREE_BP_INIT | STAGE_ISSUE_ASYNC | STAGE_DDT_FREE; | ||
-// | ||
-// const CLAIM_PIPELINE = INTERLOCK_STAGES | STAGE_DVA_CLAIM; | ||
-// | ||
-// const IOCTL_PIPELINE = INTERLOCK_STAGES | STAGE_VDEV_IO_START | STAGE_VDEV_IO_ASSESS; | ||
-// | ||
-// const BLOCKING_STAGES = STAGE_DVA_ALLOCATE | STAGE_DVA_CLAIM | STAGE_VDEV_IO_START; | ||
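Each pipeline constant above is just an OR of Stage bits, and the commented-out execute() picks the next stage by shifting the current stage bit left until it lands on a bit set in the pipeline mask. Below is a small, self-contained sketch of that selection; the stage values mirror the Stage enum above, READ_PIPELINE is composed exactly as in the constants, and the function name next_stage is illustrative.

```rust
// Sketch of how a pipeline bitmask selects the next stage to run.
const STAGE_OPEN: u32 = 1 << 0;
const STAGE_READ_BP_INIT: u32 = 1 << 1;
const STAGE_READY: u32 = 1 << 16;
const STAGE_VDEV_IO_START: u32 = 1 << 17;
const STAGE_VDEV_IO_DONE: u32 = 1 << 18;
const STAGE_VDEV_IO_ASSESS: u32 = 1 << 19;
const STAGE_CHECKSUM_VERIFY: u32 = 1 << 20;
const STAGE_DONE: u32 = 1 << 21;

// READ_PIPELINE = interlock stages | vdev I/O stages | checksum verify | read_bp_init
const READ_PIPELINE: u32 = STAGE_READY | STAGE_DONE
    | STAGE_VDEV_IO_START | STAGE_VDEV_IO_DONE | STAGE_VDEV_IO_ASSESS
    | STAGE_CHECKSUM_VERIFY | STAGE_READ_BP_INIT;

/// Given the stage just completed, return the next stage enabled in `pipeline`.
fn next_stage(mut stage: u32, pipeline: u32) -> u32 {
    // Every pipeline contains STAGE_DONE, so this always terminates.
    loop {
        stage <<= 1;
        if stage & pipeline != 0 {
            return stage;
        }
    }
}

fn main() {
    // Starting from OPEN, a read zio visits READ_BP_INIT first...
    assert_eq!(next_stage(STAGE_OPEN, READ_PIPELINE), STAGE_READ_BP_INIT);
    // ...and from READY it goes straight to VDEV_IO_START.
    assert_eq!(next_stage(STAGE_READY, READ_PIPELINE), STAGE_VDEV_IO_START);
    println!("pipeline walk ok");
}
```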
-// |