btrfs-progs/mkfs/common.c
Qu Wenruo 9de25ff2ae btrfs-progs: mkfs: use proper zoned compatible write for bgt feature
[BUG]
There is a bug report that mkfs.btrfs can not specify block-group-tree
feature along with zoned devices:

  # mkfs.btrfs /dev/nullb0 -O block-group-tree,zoned
  btrfs-progs v6.7.1
  See https://btrfs.readthedocs.io for more information.

  Resetting device zones /dev/nullb0 (40 zones) ...
  NOTE: several default settings have changed in version 5.15, please make sure
        this does not affect your deployments:
        - DUP for metadata (-m dup)
        - enabled no-holes (-O no-holes)
        - enabled free-space-tree (-R free-space-tree)

  ERROR: error during mkfs: Invalid argument

[CAUSE]
During mkfs, we need to write all the 7 or 8 tree blocks into the
metadata zone, and since it's a zoned device, we need to fulfill all the
requirements for zoned writes, including:

- All writes must be in sequential bytenr
- Buffer must be aligned to sector size

The sequential bytenr requirement is already met by the mkfs design, but
the second requirement on memory alignment is never met for metadata, as
we put the contents of a leaf in extent_buffer::data[], which is after a
lot of small members.

Thus metadata IO buffer would never be aligned to sector size (normally
4K).
And we require btrfs_pwrite() and btrfs_pread() to handle the memory
alignment for us.

However in create_block_group_tree() we didn't use btrfs_pwrite(), but
plain pwrite() call directly, which would lead to -EINVAL error due to
memory alignment problem.

[FIX]
Just call btrfs_pwrite() instead of the plain pwrite() in
create_block_group_tree().

Issue: #765
Pull-request: #767
Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
Reviewed-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2024-04-18 19:15:43 +02:00

1193 lines
34 KiB
C

/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 02111-1307, USA.
*/
#include <sys/stat.h>
#include <unistd.h>
#include <fcntl.h>
#include <limits.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <uuid/uuid.h>
#include <blkid/blkid.h>
#include "kernel-shared/ctree.h"
#include "kernel-shared/accessors.h"
#include "kernel-shared/disk-io.h"
#include "kernel-shared/volumes.h"
#include "kernel-shared/transaction.h"
#include "kernel-shared/extent_io.h"
#include "kernel-shared/zoned.h"
#include "common/fsfeatures.h"
#include "common/internal.h"
#include "common/messages.h"
#include "common/path-utils.h"
#include "common/device-utils.h"
#include "common/open-utils.h"
#include "mkfs/common.h"
/*
 * Map every mkfs block slot (enum btrfs_mkfs_block) to the objectid of the
 * tree stored in that slot.
 *
 * The table is used to build the root item keys of the root tree and the
 * tree block backrefs of the extent tree.  It is only ever read, hence const.
 */
static const u64 reference_root_table[] = {
	[MKFS_ROOT_TREE]	= BTRFS_ROOT_TREE_OBJECTID,
	[MKFS_EXTENT_TREE]	= BTRFS_EXTENT_TREE_OBJECTID,
	[MKFS_CHUNK_TREE]	= BTRFS_CHUNK_TREE_OBJECTID,
	[MKFS_DEV_TREE]		= BTRFS_DEV_TREE_OBJECTID,
	[MKFS_FS_TREE]		= BTRFS_FS_TREE_OBJECTID,
	[MKFS_CSUM_TREE]	= BTRFS_CSUM_TREE_OBJECTID,
	[MKFS_FREE_SPACE_TREE]	= BTRFS_FREE_SPACE_TREE_OBJECTID,
	[MKFS_BLOCK_GROUP_TREE]	= BTRFS_BLOCK_GROUP_TREE_OBJECTID,
};
/*
 * Turn @buf into an empty leaf owned by @objectid at logical address @block,
 * checksum it and write it out to @fd.
 *
 * Only the bytenr, owner and nritems header fields are updated; the rest of
 * the header is reused as the caller left it.
 *
 * Return 0 on success, -errno if the write failed, -EIO on a short write.
 */
static int btrfs_write_empty_tree(int fd, struct btrfs_mkfs_config *cfg,
				  struct extent_buffer *buf, u64 objectid,
				  u64 block)
{
	const size_t header_size = sizeof(struct btrfs_header);
	int written;

	/* Wipe everything behind the header, the leaf carries no items. */
	memset(buf->data + header_size, 0, cfg->nodesize - header_size);

	btrfs_set_header_bytenr(buf, block);
	btrfs_set_header_owner(buf, objectid);
	btrfs_set_header_nritems(buf, 0);
	csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
			     cfg->csum_type);

	written = btrfs_pwrite(fd, buf->data, cfg->nodesize, block,
			       cfg->zone_size);
	if (written == cfg->nodesize)
		return 0;
	if (written < 0)
		return -errno;
	return -EIO;
}
static int btrfs_create_tree_root(int fd, struct btrfs_mkfs_config *cfg,
struct extent_buffer *buf,
const enum btrfs_mkfs_block *blocks,
int blocks_nr)
{
struct btrfs_root_item root_item;
struct btrfs_inode_item *inode_item;
struct btrfs_disk_key disk_key;
u32 nritems = 0;
u32 itemoff;
int ret = 0;
int blk;
int i;
u8 uuid[BTRFS_UUID_SIZE];
bool block_group_tree = !!(cfg->features.compat_ro_flags &
BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE);
memset(buf->data + sizeof(struct btrfs_header), 0,
cfg->nodesize - sizeof(struct btrfs_header));
memset(&root_item, 0, sizeof(root_item));
memset(&disk_key, 0, sizeof(disk_key));
/* create the items for the root tree */
inode_item = &root_item.inode;
btrfs_set_stack_inode_generation(inode_item, 1);
btrfs_set_stack_inode_size(inode_item, 3);
btrfs_set_stack_inode_nlink(inode_item, 1);
btrfs_set_stack_inode_nbytes(inode_item, cfg->nodesize);
btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
btrfs_set_root_refs(&root_item, 1);
btrfs_set_root_used(&root_item, cfg->nodesize);
btrfs_set_root_generation(&root_item, 1);
btrfs_set_disk_key_type(&disk_key, BTRFS_ROOT_ITEM_KEY);
btrfs_set_disk_key_offset(&disk_key, 0);
itemoff = cfg->leaf_data_size - sizeof(root_item);
for (i = 0; i < blocks_nr; i++) {
blk = blocks[i];
if (blk == MKFS_ROOT_TREE || blk == MKFS_CHUNK_TREE)
continue;
if (!block_group_tree && blk == MKFS_BLOCK_GROUP_TREE)
continue;
btrfs_set_root_bytenr(&root_item, cfg->blocks[blk]);
btrfs_set_disk_key_objectid(&disk_key,
reference_root_table[blk]);
btrfs_set_item_key(buf, &disk_key, nritems);
btrfs_set_item_offset(buf, nritems, itemoff);
btrfs_set_item_size(buf, nritems, sizeof(root_item));
if (blk == MKFS_FS_TREE) {
time_t now = time(NULL);
uuid_generate(uuid);
memcpy(root_item.uuid, uuid, BTRFS_UUID_SIZE);
btrfs_set_stack_timespec_sec(&root_item.otime, now);
btrfs_set_stack_timespec_sec(&root_item.ctime, now);
} else {
memset(uuid, 0, BTRFS_UUID_SIZE);
memcpy(root_item.uuid, uuid, BTRFS_UUID_SIZE);
btrfs_set_stack_timespec_sec(&root_item.otime, 0);
btrfs_set_stack_timespec_sec(&root_item.ctime, 0);
}
write_extent_buffer(buf, &root_item,
btrfs_item_ptr_offset(buf, nritems),
sizeof(root_item));
nritems++;
itemoff -= sizeof(root_item);
}
btrfs_set_header_nritems(buf, nritems);
/* generate checksum */
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
/* write back root tree */
ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
cfg->blocks[MKFS_ROOT_TREE], cfg->zone_size);
if (ret != cfg->nodesize)
return (ret < 0 ? -errno : -EIO);
return ret;
}
/*
 * Build and write the initial free space tree leaf.
 *
 * The leaf holds two items for the block group [@group_start,
 * @group_start + @group_size): a free space info item with an extent count
 * of one, and a zero sized free space extent item covering
 * [@free_start, @group_start + @group_size).
 *
 * Return 0 on success, -errno if the write failed, -EIO on a short write.
 */
static int create_free_space_tree(int fd, struct btrfs_mkfs_config *cfg,
				  struct extent_buffer *buf, u64 group_start,
				  u64 group_size, u64 free_start)
{
	const u64 free_len = group_start + group_size - free_start;
	struct btrfs_free_space_info *fs_info_item;
	struct btrfs_disk_key key;
	int data_off = cfg->leaf_data_size;
	int slot = 0;
	int written;

	memset(buf->data + sizeof(struct btrfs_header), 0,
	       cfg->nodesize - sizeof(struct btrfs_header));

	/* Slot 0: the free space info of the block group. */
	data_off -= sizeof(*fs_info_item);
	btrfs_set_disk_key_objectid(&key, group_start);
	btrfs_set_disk_key_offset(&key, group_size);
	btrfs_set_disk_key_type(&key, BTRFS_FREE_SPACE_INFO_KEY);
	btrfs_set_item_key(buf, &key, slot);
	btrfs_set_item_offset(buf, slot, data_off);
	btrfs_set_item_size(buf, slot, sizeof(*fs_info_item));
	fs_info_item = btrfs_item_ptr(buf, slot, struct btrfs_free_space_info);
	btrfs_set_free_space_extent_count(buf, fs_info_item, 1);
	btrfs_set_free_space_flags(buf, fs_info_item, 0);
	slot++;

	/* Slot 1: the single free extent, described by its key alone. */
	btrfs_set_disk_key_objectid(&key, free_start);
	btrfs_set_disk_key_offset(&key, free_len);
	btrfs_set_disk_key_type(&key, BTRFS_FREE_SPACE_EXTENT_KEY);
	btrfs_set_item_key(buf, &key, slot);
	btrfs_set_item_offset(buf, slot, data_off);
	btrfs_set_item_size(buf, slot, 0);
	slot++;

	btrfs_set_header_bytenr(buf, cfg->blocks[MKFS_FREE_SPACE_TREE]);
	btrfs_set_header_owner(buf, BTRFS_FREE_SPACE_TREE_OBJECTID);
	btrfs_set_header_nritems(buf, slot);
	csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
			     cfg->csum_type);

	written = btrfs_pwrite(fd, buf->data, cfg->nodesize,
			       cfg->blocks[MKFS_FREE_SPACE_TREE],
			       cfg->zone_size);
	if (written == cfg->nodesize)
		return 0;
	if (written < 0)
		return -errno;
	return -EIO;
}
/*
 * Fill slot @nr of leaf @buf with a block group item.
 *
 * The key is (@objectid, BTRFS_BLOCK_GROUP_ITEM_KEY, @offset) and the item
 * data is placed at @itemoff.  The item records @used bytes and
 * @chunk_objectid; its flags are always BTRFS_BLOCK_GROUP_SYSTEM.
 */
static void write_block_group_item(struct extent_buffer *buf, u32 nr,
				   u64 objectid, u64 offset, u64 used,
				   u64 chunk_objectid, u32 itemoff)
{
	struct btrfs_disk_key key;
	struct btrfs_block_group_item *bgi;

	btrfs_set_disk_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
	btrfs_set_disk_key_objectid(&key, objectid);
	btrfs_set_disk_key_offset(&key, offset);

	/* The item header must be in place before the data can be located. */
	btrfs_set_item_key(buf, &key, nr);
	btrfs_set_item_offset(buf, nr, itemoff);
	btrfs_set_item_size(buf, nr, sizeof(*bgi));

	bgi = btrfs_item_ptr(buf, nr, struct btrfs_block_group_item);
	btrfs_set_block_group_used(buf, bgi, used);
	btrfs_set_block_group_flags(buf, bgi, BTRFS_BLOCK_GROUP_SYSTEM);
	btrfs_set_block_group_chunk_objectid(buf, bgi, chunk_objectid);
}
/*
 * Build and write the initial block group tree leaf, holding the single
 * block group item for [@bg_offset, @bg_offset + @bg_size) with @bg_used
 * bytes used.
 *
 * Return 0 on success, -errno if the write failed, -EIO on a short write.
 */
static int create_block_group_tree(int fd, struct btrfs_mkfs_config *cfg,
				   struct extent_buffer *buf,
				   u64 bg_offset, u64 bg_size, u64 bg_used)
{
	const bool extent_tree_v2 = cfg->features.incompat_flags &
				    BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2;
	const u32 item_off = cfg->leaf_data_size -
			     sizeof(struct btrfs_block_group_item);
	u64 chunk_objectid;
	int written;

	/*
	 * With extent-tree-v2 the chunk_objectid of a block group item tells
	 * which extent tree the block group belongs to.  All initial block
	 * groups live in the extent tree with global id 0, so use 0 there.
	 */
	chunk_objectid = extent_tree_v2 ? 0 : BTRFS_FIRST_CHUNK_TREE_OBJECTID;

	memset(buf->data + sizeof(struct btrfs_header), 0,
	       cfg->nodesize - sizeof(struct btrfs_header));
	write_block_group_item(buf, 0, bg_offset, bg_size, bg_used,
			       chunk_objectid, item_off);

	btrfs_set_header_bytenr(buf, cfg->blocks[MKFS_BLOCK_GROUP_TREE]);
	btrfs_set_header_owner(buf, BTRFS_BLOCK_GROUP_TREE_OBJECTID);
	btrfs_set_header_nritems(buf, 1);
	csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
			     cfg->csum_type);

	written = btrfs_pwrite(fd, buf->data, cfg->nodesize,
			       cfg->blocks[MKFS_BLOCK_GROUP_TREE],
			       cfg->zone_size);
	if (written == cfg->nodesize)
		return 0;
	if (written < 0)
		return -errno;
	return -EIO;
}
/*
 * Pick the start offset of the system block group on a zoned device.
 *
 * Starting at zone number BTRFS_NR_SB_LOG_ZONES, return the start of the
 * first zone of size @zone_size that is neither one of the zones reported by
 * sb_zone_number() nor overlaps the byte range of any superblock copy.
 */
static u64 zoned_system_group_offset(u64 zone_size)
{
	const int zone_shift = ilog2(zone_size);
	u32 sb_zones[BTRFS_SUPER_MIRROR_MAX];
	u32 zone_num = BTRFS_NR_SB_LOG_ZONES;
	u64 start = (u64)zone_num * zone_size;
	int mirror;

	for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++)
		sb_zones[mirror] = sb_zone_number(zone_shift, mirror);

	while (true) {
		bool conflict = false;

		for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
			/*
			 * The zone is unusable if it is a superblock zone or
			 * if [start, start + zone_size) intersects the
			 * superblock copy of this mirror.
			 */
			if (zone_num == sb_zones[mirror] ||
			    (btrfs_sb_offset(mirror) + BTRFS_SUPER_INFO_SIZE > start &&
			     start + zone_size > btrfs_sb_offset(mirror))) {
				conflict = true;
				break;
			}
		}
		if (!conflict)
			return start;

		zone_num++;
		start += zone_size;
	}
	__builtin_unreachable();
}
/*
* Add @block into the @blocks array.
*
* The @blocks should already be in ascending order and no duplicate.
*/
static void mkfs_blocks_add(enum btrfs_mkfs_block *blocks, int *blocks_nr,
enum btrfs_mkfs_block to_add)
{
int i;
for (i = 0; i < *blocks_nr; i++) {
/* The target is already in the array. */
if (blocks[i] == to_add)
return;
/*
* We find the first one past @to_add, move the array one slot
* right, insert a new one.
*/
if (blocks[i] > to_add) {
memmove(blocks + i + 1, blocks + i, *blocks_nr - i);
blocks[i] = to_add;
(*blocks_nr)++;
return;
}
/* Current one still smaller than @to_add, go to next slot. */
}
/* All slots iterated and not match, insert into the last slot. */
blocks[i] = to_add;
(*blocks_nr)++;
return;
}
/*
* Remove @block from the @blocks array.
*
* The @blocks should already be in ascending order and no duplicate.
*/
static void mkfs_blocks_remove(enum btrfs_mkfs_block *blocks, int *blocks_nr,
enum btrfs_mkfs_block to_remove)
{
int i;
for (i = 0; i < *blocks_nr; i++) {
/* Found the target, move the array one slot left. */
if (blocks[i] == to_remove) {
memmove(blocks + i, blocks + i + 1, *blocks_nr - i - 1);
(*blocks_nr)--;
}
}
/* Nothing found, exit directly. */
return;
}
/*
* @fs_uuid - if NULL, generates a UUID, returns back the new filesystem UUID
*
* The superblock signature is not valid, denotes a partially created
* filesystem, needs to be finalized.
*
* The temporary fs will have the following chunk layout:
* Device extent:
* 0 1M 5M ......
* | Reserved | dev extent for SYS chunk |
*
* And chunk mapping will be:
* Chunk mapping:
* 0 1M 5M
* | | System chunk, 1:1 mapped |
*
* That's to say, there will only be *ONE* system chunk, mapped to
* [1M, 5M) physical offset.
* And the only chunk is also in logical address [1M, 5M), containing
* all essential tree blocks.
*/
int make_btrfs(int fd, struct btrfs_mkfs_config *cfg)
{
struct btrfs_super_block super;
struct extent_buffer *buf;
struct btrfs_disk_key disk_key;
struct btrfs_extent_item *extent_item;
struct btrfs_chunk *chunk;
struct btrfs_dev_item *dev_item;
struct btrfs_dev_extent *dev_extent;
enum btrfs_mkfs_block blocks[MKFS_BLOCK_COUNT];
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
u8 *ptr;
int i;
int ret;
int blocks_nr;
int blk;
u32 itemoff;
u32 nritems = 0;
u64 first_free;
u64 ref_root;
u32 array_size;
u32 item_size;
u64 total_used = 0;
int skinny_metadata = !!(cfg->features.incompat_flags &
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA);
u64 num_bytes;
u64 system_group_offset = BTRFS_BLOCK_RESERVED_1M_FOR_SUPER;
u64 system_group_size = BTRFS_MKFS_SYSTEM_GROUP_SIZE;
bool add_block_group = true;
bool free_space_tree = !!(cfg->features.compat_ro_flags &
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
bool block_group_tree = !!(cfg->features.compat_ro_flags &
BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE);
bool extent_tree_v2 = !!(cfg->features.incompat_flags &
BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2);
memcpy(blocks, default_blocks,
sizeof(enum btrfs_mkfs_block) * ARRAY_SIZE(default_blocks));
blocks_nr = ARRAY_SIZE(default_blocks);
/*
* Add one new block for block group tree.
* And for block group tree, we don't need to add block group item
* into extent tree, the item will be handled in block group tree
* initialization.
*/
if (block_group_tree) {
mkfs_blocks_add(blocks, &blocks_nr, MKFS_BLOCK_GROUP_TREE);
add_block_group = false;
}
/* Don't include the free space tree in the blocks to process. */
if (!free_space_tree)
mkfs_blocks_remove(blocks, &blocks_nr, MKFS_FREE_SPACE_TREE);
if ((cfg->features.incompat_flags & BTRFS_FEATURE_INCOMPAT_ZONED)) {
system_group_offset = zoned_system_group_offset(cfg->zone_size);
system_group_size = cfg->zone_size;
}
buf = malloc(sizeof(*buf) + max(cfg->sectorsize, cfg->nodesize));
if (!buf)
return -ENOMEM;
first_free = BTRFS_SUPER_INFO_OFFSET + cfg->sectorsize * 2 - 1;
first_free &= ~((u64)cfg->sectorsize - 1);
memset(&super, 0, sizeof(super));
num_bytes = (cfg->num_bytes / cfg->sectorsize) * cfg->sectorsize;
if (!*cfg->fs_uuid) {
uuid_generate(super.fsid);
uuid_unparse(super.fsid, cfg->fs_uuid);
} else {
uuid_parse(cfg->fs_uuid, super.fsid);
}
if (!*cfg->dev_uuid) {
uuid_generate(super.dev_item.uuid);
uuid_unparse(super.dev_item.uuid, cfg->dev_uuid);
} else {
uuid_parse(cfg->dev_uuid, super.dev_item.uuid);
}
uuid_generate(chunk_tree_uuid);
for (i = 0; i < blocks_nr; i++) {
blk = blocks[i];
cfg->blocks[blk] = system_group_offset + cfg->nodesize * i;
total_used += cfg->nodesize;
}
btrfs_set_super_bytenr(&super, BTRFS_SUPER_INFO_OFFSET);
btrfs_set_super_num_devices(&super, 1);
btrfs_set_super_magic(&super, BTRFS_MAGIC_TEMPORARY);
btrfs_set_super_generation(&super, 1);
btrfs_set_super_root(&super, cfg->blocks[MKFS_ROOT_TREE]);
btrfs_set_super_chunk_root(&super, cfg->blocks[MKFS_CHUNK_TREE]);
btrfs_set_super_total_bytes(&super, num_bytes);
btrfs_set_super_bytes_used(&super, total_used);
btrfs_set_super_sectorsize(&super, cfg->sectorsize);
super.__unused_leafsize = cpu_to_le32(cfg->nodesize);
btrfs_set_super_nodesize(&super, cfg->nodesize);
btrfs_set_super_stripesize(&super, cfg->stripesize);
btrfs_set_super_csum_type(&super, cfg->csum_type);
btrfs_set_super_chunk_root_generation(&super, 1);
if (cfg->features.incompat_flags & BTRFS_FEATURE_INCOMPAT_ZONED)
btrfs_set_super_cache_generation(&super, 0);
else
btrfs_set_super_cache_generation(&super, -1);
btrfs_set_super_incompat_flags(&super, cfg->features.incompat_flags);
if (free_space_tree)
btrfs_set_super_cache_generation(&super, 0);
btrfs_set_super_compat_ro_flags(&super, cfg->features.compat_ro_flags);
if (extent_tree_v2)
btrfs_set_super_nr_global_roots(&super, 1);
if (cfg->label)
__strncpy_null(super.label, cfg->label, BTRFS_LABEL_SIZE - 1);
/* create the tree of root objects */
memset(buf->data, 0, cfg->nodesize);
buf->len = cfg->nodesize;
btrfs_set_header_bytenr(buf, cfg->blocks[MKFS_ROOT_TREE]);
btrfs_set_header_generation(buf, 1);
btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
btrfs_set_header_owner(buf, BTRFS_ROOT_TREE_OBJECTID);
write_extent_buffer(buf, super.fsid, btrfs_header_fsid(),
BTRFS_FSID_SIZE);
write_extent_buffer(buf, chunk_tree_uuid,
btrfs_header_chunk_tree_uuid(buf),
BTRFS_UUID_SIZE);
ret = btrfs_create_tree_root(fd, cfg, buf, blocks, blocks_nr);
if (ret < 0)
goto out;
/* create the items for the extent tree */
memset(buf->data + sizeof(struct btrfs_header), 0,
cfg->nodesize - sizeof(struct btrfs_header));
nritems = 0;
itemoff = cfg->leaf_data_size;
for (i = 0; i < blocks_nr; i++) {
blk = blocks[i];
/* Add the block group item for our temporary chunk. */
if (cfg->blocks[blk] > system_group_offset && add_block_group) {
itemoff -= sizeof(struct btrfs_block_group_item);
write_block_group_item(buf, nritems,
system_group_offset,
system_group_size, total_used,
BTRFS_FIRST_CHUNK_TREE_OBJECTID,
itemoff);
add_block_group = false;
nritems++;
}
item_size = sizeof(struct btrfs_extent_item);
if (!skinny_metadata)
item_size += sizeof(struct btrfs_tree_block_info);
if (cfg->blocks[blk] < first_free) {
error("block[%d] below first free: %llu < %llu",
i, cfg->blocks[blk], first_free);
ret = -EINVAL;
goto out;
}
if (i > 0 && cfg->blocks[blk] < cfg->blocks[blocks[i - 1]]) {
error("blocks %d and %d in reverse order: %llu < %llu",
blk, blocks[i - 1],
cfg->blocks[blk], cfg->blocks[blocks[i - 1]]);
ret = -EINVAL;
goto out;
}
/* create extent item */
itemoff -= item_size;
btrfs_set_disk_key_objectid(&disk_key, cfg->blocks[blk]);
if (skinny_metadata) {
btrfs_set_disk_key_type(&disk_key,
BTRFS_METADATA_ITEM_KEY);
btrfs_set_disk_key_offset(&disk_key, 0);
} else {
btrfs_set_disk_key_type(&disk_key,
BTRFS_EXTENT_ITEM_KEY);
btrfs_set_disk_key_offset(&disk_key, cfg->nodesize);
}
btrfs_set_item_key(buf, &disk_key, nritems);
btrfs_set_item_offset(buf, nritems, itemoff);
btrfs_set_item_size(buf, nritems, item_size);
extent_item = btrfs_item_ptr(buf, nritems,
struct btrfs_extent_item);
btrfs_set_extent_refs(buf, extent_item, 1);
btrfs_set_extent_generation(buf, extent_item, 1);
btrfs_set_extent_flags(buf, extent_item,
BTRFS_EXTENT_FLAG_TREE_BLOCK);
nritems++;
/* create extent ref */
ref_root = reference_root_table[blk];
btrfs_set_disk_key_objectid(&disk_key, cfg->blocks[blk]);
btrfs_set_disk_key_offset(&disk_key, ref_root);
btrfs_set_disk_key_type(&disk_key, BTRFS_TREE_BLOCK_REF_KEY);
btrfs_set_item_key(buf, &disk_key, nritems);
btrfs_set_item_offset(buf, nritems, itemoff);
btrfs_set_item_size(buf, nritems, 0);
nritems++;
}
btrfs_set_header_bytenr(buf, cfg->blocks[MKFS_EXTENT_TREE]);
btrfs_set_header_owner(buf, BTRFS_EXTENT_TREE_OBJECTID);
btrfs_set_header_nritems(buf, nritems);
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
cfg->blocks[MKFS_EXTENT_TREE], cfg->zone_size);
if (ret != cfg->nodesize) {
ret = (ret < 0 ? -errno : -EIO);
goto out;
}
/* create the chunk tree */
memset(buf->data + sizeof(struct btrfs_header), 0,
cfg->nodesize - sizeof(struct btrfs_header));
nritems = 0;
item_size = sizeof(*dev_item);
itemoff = cfg->leaf_data_size - item_size;
/* first device 1 (there is no device 0) */
btrfs_set_disk_key_objectid(&disk_key, BTRFS_DEV_ITEMS_OBJECTID);
btrfs_set_disk_key_offset(&disk_key, 1);
btrfs_set_disk_key_type(&disk_key, BTRFS_DEV_ITEM_KEY);
btrfs_set_item_key(buf, &disk_key, nritems);
btrfs_set_item_offset(buf, nritems, itemoff);
btrfs_set_item_size(buf, nritems, item_size);
dev_item = btrfs_item_ptr(buf, nritems, struct btrfs_dev_item);
btrfs_set_device_id(buf, dev_item, 1);
btrfs_set_device_generation(buf, dev_item, 0);
btrfs_set_device_total_bytes(buf, dev_item, num_bytes);
btrfs_set_device_bytes_used(buf, dev_item, system_group_size);
btrfs_set_device_io_align(buf, dev_item, cfg->sectorsize);
btrfs_set_device_io_width(buf, dev_item, cfg->sectorsize);
btrfs_set_device_sector_size(buf, dev_item, cfg->sectorsize);
btrfs_set_device_type(buf, dev_item, 0);
write_extent_buffer(buf, super.dev_item.uuid,
(unsigned long)btrfs_device_uuid(dev_item),
BTRFS_UUID_SIZE);
write_extent_buffer(buf, super.fsid,
(unsigned long)btrfs_device_fsid(dev_item),
BTRFS_UUID_SIZE);
read_extent_buffer(buf, &super.dev_item, (unsigned long)dev_item,
sizeof(*dev_item));
nritems++;
item_size = btrfs_chunk_item_size(1);
itemoff = itemoff - item_size;
/* then we have chunk 0 */
btrfs_set_disk_key_objectid(&disk_key, BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_disk_key_offset(&disk_key, system_group_offset);
btrfs_set_disk_key_type(&disk_key, BTRFS_CHUNK_ITEM_KEY);
btrfs_set_item_key(buf, &disk_key, nritems);
btrfs_set_item_offset(buf, nritems, itemoff);
btrfs_set_item_size(buf, nritems, item_size);
chunk = btrfs_item_ptr(buf, nritems, struct btrfs_chunk);
btrfs_set_chunk_length(buf, chunk, system_group_size);
btrfs_set_chunk_owner(buf, chunk, BTRFS_EXTENT_TREE_OBJECTID);
btrfs_set_chunk_stripe_len(buf, chunk, BTRFS_STRIPE_LEN);
btrfs_set_chunk_type(buf, chunk, BTRFS_BLOCK_GROUP_SYSTEM);
btrfs_set_chunk_io_align(buf, chunk, cfg->sectorsize);
btrfs_set_chunk_io_width(buf, chunk, cfg->sectorsize);
btrfs_set_chunk_sector_size(buf, chunk, cfg->sectorsize);
btrfs_set_chunk_num_stripes(buf, chunk, 1);
btrfs_set_stripe_devid_nr(buf, chunk, 0, 1);
btrfs_set_stripe_offset_nr(buf, chunk, 0,
system_group_offset);
nritems++;
write_extent_buffer(buf, super.dev_item.uuid,
(unsigned long)btrfs_stripe_dev_uuid(&chunk->stripe),
BTRFS_UUID_SIZE);
/* copy the key for the chunk to the system array */
ptr = super.sys_chunk_array;
array_size = sizeof(disk_key);
memcpy(ptr, &disk_key, sizeof(disk_key));
ptr += sizeof(disk_key);
/* copy the chunk to the system array */
read_extent_buffer(buf, ptr, (unsigned long)chunk, item_size);
array_size += item_size;
ptr += item_size;
btrfs_set_super_sys_array_size(&super, array_size);
btrfs_set_header_bytenr(buf, cfg->blocks[MKFS_CHUNK_TREE]);
btrfs_set_header_owner(buf, BTRFS_CHUNK_TREE_OBJECTID);
btrfs_set_header_nritems(buf, nritems);
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
cfg->blocks[MKFS_CHUNK_TREE], cfg->zone_size);
if (ret != cfg->nodesize) {
ret = (ret < 0 ? -errno : -EIO);
goto out;
}
/* create the device tree */
memset(buf->data + sizeof(struct btrfs_header), 0,
cfg->nodesize - sizeof(struct btrfs_header));
nritems = 0;
itemoff = cfg->leaf_data_size - sizeof(struct btrfs_dev_extent);
btrfs_set_disk_key_objectid(&disk_key, 1);
btrfs_set_disk_key_offset(&disk_key, system_group_offset);
btrfs_set_disk_key_type(&disk_key, BTRFS_DEV_EXTENT_KEY);
btrfs_set_item_key(buf, &disk_key, nritems);
btrfs_set_item_offset(buf, nritems, itemoff);
btrfs_set_item_size(buf, nritems, sizeof(struct btrfs_dev_extent));
dev_extent = btrfs_item_ptr(buf, nritems, struct btrfs_dev_extent);
btrfs_set_dev_extent_chunk_tree(buf, dev_extent,
BTRFS_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_objectid(buf, dev_extent,
BTRFS_FIRST_CHUNK_TREE_OBJECTID);
btrfs_set_dev_extent_chunk_offset(buf, dev_extent,
system_group_offset);
write_extent_buffer(buf, chunk_tree_uuid,
(unsigned long)btrfs_dev_extent_chunk_tree_uuid(dev_extent),
BTRFS_UUID_SIZE);
btrfs_set_dev_extent_length(buf, dev_extent, system_group_size);
nritems++;
btrfs_set_header_bytenr(buf, cfg->blocks[MKFS_DEV_TREE]);
btrfs_set_header_owner(buf, BTRFS_DEV_TREE_OBJECTID);
btrfs_set_header_nritems(buf, nritems);
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
ret = btrfs_pwrite(fd, buf->data, cfg->nodesize,
cfg->blocks[MKFS_DEV_TREE], cfg->zone_size);
if (ret != cfg->nodesize) {
ret = (ret < 0 ? -errno : -EIO);
goto out;
}
/* create the FS root */
ret = btrfs_write_empty_tree(fd, cfg, buf, BTRFS_FS_TREE_OBJECTID,
cfg->blocks[MKFS_FS_TREE]);
if (ret)
goto out;
/* finally create the csum root */
ret = btrfs_write_empty_tree(fd, cfg, buf, BTRFS_CSUM_TREE_OBJECTID,
cfg->blocks[MKFS_CSUM_TREE]);
if (ret)
goto out;
if (free_space_tree) {
ret = create_free_space_tree(fd, cfg, buf, system_group_offset,
system_group_size,
system_group_offset + total_used);
if (ret)
goto out;
}
if (block_group_tree) {
ret = create_block_group_tree(fd, cfg, buf,
system_group_offset,
system_group_size, total_used);
if (ret)
goto out;
}
/* and write out the super block */
memset(buf->data, 0, BTRFS_SUPER_INFO_SIZE);
memcpy(buf->data, &super, sizeof(super));
buf->len = BTRFS_SUPER_INFO_SIZE;
csum_tree_block_size(buf, btrfs_csum_type_size(cfg->csum_type), 0,
cfg->csum_type);
ret = sbwrite(fd, buf->data, BTRFS_SUPER_INFO_OFFSET);
if (ret != BTRFS_SUPER_INFO_SIZE) {
ret = (ret < 0 ? -errno : -EIO);
goto out;
}
ret = fsync(fd);
if (ret)
goto out;
ret = 0;
out:
free(buf);
return ret;
}
/*
 * Create the top level directory inode @objectid in @root.
 *
 * Inserts an empty directory inode (mode 0755, all timestamps set to the
 * current time) together with its ".." inode ref pointing at itself, and
 * records @objectid as the root dirid of @root.  When @root is the tree root,
 * @objectid is also stored as the root dir of the superblock copy.
 *
 * Return 0 on success, otherwise the error of the failed insertion.
 */
int btrfs_make_root_dir(struct btrfs_trans_handle *trans,
			struct btrfs_root *root, u64 objectid)
{
	struct btrfs_inode_item dir_inode;
	const time_t now = time(NULL);
	int ret;

	memset(&dir_inode, 0, sizeof(dir_inode));
	btrfs_set_stack_inode_generation(&dir_inode, trans->transid);
	btrfs_set_stack_inode_size(&dir_inode, 0);
	btrfs_set_stack_inode_nlink(&dir_inode, 1);
	btrfs_set_stack_inode_nbytes(&dir_inode, root->fs_info->nodesize);
	btrfs_set_stack_inode_mode(&dir_inode, S_IFDIR | 0755);

	/* Whole seconds only, the nanosecond parts stay zero. */
	btrfs_set_stack_timespec_sec(&dir_inode.atime, now);
	btrfs_set_stack_timespec_nsec(&dir_inode.atime, 0);
	btrfs_set_stack_timespec_sec(&dir_inode.ctime, now);
	btrfs_set_stack_timespec_nsec(&dir_inode.ctime, 0);
	btrfs_set_stack_timespec_sec(&dir_inode.mtime, now);
	btrfs_set_stack_timespec_nsec(&dir_inode.mtime, 0);
	btrfs_set_stack_timespec_sec(&dir_inode.otime, now);
	btrfs_set_stack_timespec_nsec(&dir_inode.otime, 0);

	if (root == root->fs_info->tree_root)
		btrfs_set_super_root_dir(root->fs_info->super_copy, objectid);

	ret = btrfs_insert_inode(trans, root, objectid, &dir_inode);
	if (ret)
		return ret;

	/* The directory is its own parent. */
	ret = btrfs_insert_inode_ref(trans, root, "..", 2, objectid, objectid, 0);
	if (ret)
		return ret;

	btrfs_set_root_dirid(&root->root_item, objectid);
	return 0;
}
/*
* Btrfs minimum size calculation is complicated, it should include at least:
* 1. system group size
* 2. minimum global block reserve
* 3. metadata used at mkfs
* 4. space reservation to create uuid for first mount.
* Also, raid factor should also be taken into consideration.
* To avoid the overkill calculation, (system group + global block rsv) * 2
* for *EACH* device should be good enough.
*/
/* Minimum global block reserve used for sizing: 1024 times @nodesize bytes. */
static u64 btrfs_min_global_blk_rsv_size(u32 nodesize)
{
	const u64 node_bytes = nodesize;

	return node_bytes * 1024;
}
/*
 * Estimate the minimum device size needed to create a filesystem.
 *
 * @nodesize:     tree block size, only used for mixed block groups
 * @mixed:        non-zero for mixed data/metadata block groups
 * @meta_profile: block group flags of the metadata profile
 * @data_profile: block group flags of the data profile
 *
 * Return the estimated number of bytes required on each device.
 */
u64 btrfs_min_dev_size(u32 nodesize, int mixed, u64 meta_profile,
		       u64 data_profile)
{
	u64 reserved;
	u64 meta_size;
	u64 data_size;

	if (mixed)
		return 2 * (BTRFS_MKFS_SYSTEM_GROUP_SIZE +
			    btrfs_min_global_blk_rsv_size(nodesize));

	/*
	 * Minimal size calculation is complex due to several factors:
	 * 0) Reserved 1M range.
	 *
	 * 1) Temporary chunk reuse
	 *    If specified chunk profile is SINGLE, we can reuse
	 *    temporary chunks, no need to allocate new chunks.
	 *
	 * 2) Different minimal chunk size for different profiles:
	 *    For initial sys chunk, chunk size is fixed to 4M.
	 *    For single profile, minimal chunk size is 8M for all.
	 *    For other profiles, minimal chunk and stripe size ranges from 8M
	 *    to 64M.
	 *
	 * To calculate it a little easier, here we assume we don't reuse any
	 * temporary chunk, and calculate the size completely by ourselves.
	 *
	 * Temporary chunks sizes are always fixed:
	 * One initial sys chunk, one SINGLE meta, and one SINGLE data.
	 * The latter two are all 8M, according to @calc_size of
	 * btrfs_alloc_chunk().
	 */
	reserved = BTRFS_BLOCK_RESERVED_1M_FOR_SUPER +
		   BTRFS_MKFS_SYSTEM_GROUP_SIZE + SZ_8M * 2;

	/*
	 * For real chunks, we need to select different sizes:
	 * For SINGLE, it's still fixed to 8M (@calc_size).
	 * For other profiles, refer to max(@min_stripe_size, @calc_size).
	 *
	 * And use the stripe size to calculate its physical used space.
	 */
	meta_size = (meta_profile & BTRFS_BLOCK_GROUP_PROFILE_MASK) ?
		    SZ_8M + SZ_32M : SZ_8M + SZ_8M;
	/* For DUP/metadata, 2 stripes on one disk */
	if (meta_profile & BTRFS_BLOCK_GROUP_DUP)
		meta_size *= 2;

	data_size = (data_profile & BTRFS_BLOCK_GROUP_PROFILE_MASK) ?
		    SZ_64M : SZ_8M;
	/* For DUP/data, 2 stripes on one disk */
	if (data_profile & BTRFS_BLOCK_GROUP_DUP)
		data_size *= 2;

	return reserved + meta_size + data_size;
}
/* True when @c is one of the characters '0' to '7'. */
#define isoctal(c)	(((c) & ~7) == '0')

/*
 * Copy the string @f to @t, replacing every backslash followed by exactly
 * three octal digits (e.g. "\040") with the character of that value.
 * Anything else is copied verbatim.  @t is always NUL terminated and must
 * be able to hold strlen(@f) + 1 bytes.
 */
static inline void translate(char *f, char *t)
{
	const char *src = f;
	char *dst = t;

	while (*src != '\0') {
		if (src[0] != '\\' || !isoctal(src[1]) ||
		    !isoctal(src[2]) || !isoctal(src[3])) {
			*dst++ = *src++;
			continue;
		}
		/* Three octal digits contribute three bits each. */
		*dst++ = 64 * (src[1] & 7) + 8 * (src[2] & 7) + (src[3] & 7);
		src += 4;
	}
	*dst = '\0';
}
/*
 * Check whether @file is listed as an active swap area in /proc/swaps.
 *
 * Block devices are matched by device number, regular files by device and
 * inode number.  Any other file type is never considered swap.
 *
 * Returns 1 if swap device, < 0 (-errno) if @file cannot be stat'ed, or 0 if
 * not a swap device.  0 is also returned when /proc/swaps cannot be opened.
 */
static int is_swap_device(const char *file)
{
	struct stat st_buf;
	char line[PATH_MAX];
	char path[PATH_MAX];
	FILE *swaps;
	char *sep;
	dev_t dev;
	ino_t ino = 0;
	int found = 0;

	if (stat(file, &st_buf) < 0)
		return -errno;

	if (S_ISBLK(st_buf.st_mode)) {
		dev = st_buf.st_rdev;
	} else if (S_ISREG(st_buf.st_mode)) {
		dev = st_buf.st_dev;
		ino = st_buf.st_ino;
	} else {
		return 0;
	}

	swaps = fopen("/proc/swaps", "r");
	if (swaps == NULL)
		return 0;

	/* skip the first line */
	if (fgets(line, sizeof(line), swaps) != NULL) {
		while (!found && fgets(line, sizeof(line), swaps) != NULL) {
			/* The path is the first field, cut at the separator. */
			sep = strchr(line, ' ');
			if (sep != NULL)
				*sep = '\0';
			sep = strchr(line, '\t');
			if (sep != NULL)
				*sep = '\0';

			/* Undo the octal escapes before using the path. */
			translate(line, path);
			if (stat(path, &st_buf) != 0)
				continue;

			if (S_ISBLK(st_buf.st_mode)) {
				if (dev == st_buf.st_rdev)
					found = 1;
			} else if (S_ISREG(st_buf.st_mode)) {
				if (dev == st_buf.st_dev &&
				    ino == st_buf.st_ino)
					found = 1;
			}
		}
	}

	fclose(swaps);
	return found;
}
/*
 * Check for signature at the offset 0 that would be present in case of zoned
 * device. Workaround for old blkid that do not recognize the format to avoid
 * accidental overwrites.
 *
 * Returns 1 if the btrfs magic is found at offset 0 of @device, 0 if not,
 * and -1 if the device cannot be opened or read.
 */
static int check_btrfs_signature_zoned(const char *device)
{
	struct btrfs_super_block sb;
	int fd;
	int ret;

	fd = open(device, O_RDONLY);
	if (fd < 0)
		return -1;

	/*
	 * pread() may return fewer bytes than requested (even 0 on an empty
	 * device).  Start from a zeroed buffer so the magic check below never
	 * looks at uninitialized memory.
	 */
	memset(&sb, 0, sizeof(sb));
	ret = pread(fd, &sb, BTRFS_SUPER_INFO_SIZE, 0);
	if (ret < 0)
		ret = -1;
	else if (btrfs_super_magic(&sb) == BTRFS_MAGIC)
		ret = 1;
	else
		ret = 0;

	close(fd);
	return ret;
}
/*
 * Check for existing filesystem or partition table on device.
 *
 * The device is probed with blkid first.  When blkid finds nothing or fails,
 * offset 0 is additionally checked for a zoned btrfs signature.
 *
 * Returns:
 *	 1	for existing fs or partition
 *	 0	for nothing found
 *	-1	for internal error
 */
static int check_overwrite(const char *device)
{
	const char *type;
	blkid_probe pr = NULL;
	int ret;
	blkid_loff_t size;

	if (!device || !*device)
		return 0;

	ret = -1; /* will reset on success of all setup calls */

	pr = blkid_new_probe_from_filename(device);
	if (!pr)
		goto out;

	size = blkid_probe_get_size(pr);
	if (size < 0)
		goto out;

	/* nothing to overwrite on a 0-length device */
	if (size == 0) {
		ret = 0;
		goto out;
	}

	ret = blkid_probe_enable_partitions(pr, 1);
	if (ret < 0)
		goto out;

	ret = blkid_do_fullprobe(pr);
	if (ret < 0)
		goto out;

	/*
	 * Blkid returns 1 for nothing found and 0 when it finds a signature,
	 * but we want the exact opposite, so reverse the return value here.
	 *
	 * In addition print some useful diagnostics about what actually is
	 * on the device.
	 */
	if (ret) {
		ret = 0;
		goto out;
	}

	if (!blkid_probe_lookup_value(pr, "TYPE", &type, NULL)) {
		error("%s appears to contain an existing filesystem (%s)", device, type);
	} else if (!blkid_probe_lookup_value(pr, "PTTYPE", &type, NULL)) {
		error("%s appears to contain a partition table (%s)", device, type);
	} else {
		error("%s appears to contain something weird according to blkid", device);
	}
	ret = 1;

out:
	if (pr)
		blkid_free_probe(pr);
	if (ret == -1)
		error("probe of %s failed, cannot detect existing filesystem", device);

	/* Either nothing found or there was an error is a reason to double check */
	if (ret == 0 || ret == -1) {
		ret = check_btrfs_signature_zoned(device);
		if (ret > 0) {
			warning(
	"%s contains zoned btrfs signature but was not detected by blkid, please update",
				device);
			ret = 1;
		} else if (ret < 0) {
			/*
			 * No trailing newline, like every other message in
			 * this file.
			 */
			warning(
	"cannot read superblock on %s, please check manually",
				device);
			ret = -1;
		}
	}

	return ret;
}
/*
 * Check if a device is suitable for btrfs.
 *
 * @file must not be an active swap device, must pass
 * test_status_for_mkfs() and must be a block device that can be opened
 * exclusively.
 *
 * returns:
 *  true:  something is wrong, an error is printed
 *  false: all is fine
 */
bool test_dev_for_mkfs(const char *file, int force_overwrite)
{
	struct stat st;
	bool unsuitable = false;
	int swap;
	int fd;

	swap = is_swap_device(file);
	if (swap < 0) {
		errno = -swap;
		error("checking status of %s: %m", file);
		return true;
	}
	if (swap == 1) {
		error("%s is a swap device", file);
		return true;
	}

	if (test_status_for_mkfs(file, force_overwrite))
		return true;

	/*
	 * Check if the device is busy. Open it in read-only mode to avoid triggering
	 * udev events.
	 */
	fd = open(file, O_RDONLY | O_EXCL);
	if (fd < 0) {
		error("unable to open %s: %m", file);
		return true;
	}

	/* Report before close() so that %m still reflects the failure. */
	if (fstat(fd, &st)) {
		error("unable to stat %s: %m", file);
		unsuitable = true;
	} else if (!S_ISBLK(st.st_mode)) {
		error("%s is not a block device", file);
		unsuitable = true;
	}

	close(fd);
	return unsuitable;
}
/*
 * Check if the file (device) is formatted or mounted.
 *
 * The check for existing content is skipped when @force_overwrite is set.
 *
 * Returns true, after printing an error, when @file must not be used and
 * false when it is fine.
 */
bool test_status_for_mkfs(const char *file, bool force_overwrite)
{
	int mounted;

	if (!force_overwrite && check_overwrite(file)) {
		error("use the -f option to force overwrite of %s",
		      file);
		return true;
	}

	mounted = check_mounted(file);
	if (mounted < 0) {
		errno = -mounted;
		error("cannot check mount status of %s: %m", file);
		return true;
	}
	if (mounted == 1) {
		error("%s is mounted", file);
		return true;
	}

	return false;
}
/*
 * Tell whether @file is smaller than BTRFS_MKFS_SMALL_VOLUME_SIZE.
 *
 * Returns 1 if it is, 0 if it is not, -errno if @file cannot be opened or
 * stat'ed, and -1 if its size is reported as 0.
 */
int is_vol_small(const char *file)
{
	struct stat st;
	u64 size;
	int result;
	int fd;

	fd = open(file, O_RDONLY);
	if (fd < 0)
		return -errno;

	if (fstat(fd, &st) < 0) {
		/* Capture errno before close() gets a chance to change it. */
		result = -errno;
	} else {
		size = device_get_partition_size_fd_stat(fd, &st);
		if (size == 0)
			result = -1;
		else if (size < BTRFS_MKFS_SMALL_VOLUME_SIZE)
			result = 1;
		else
			result = 0;
	}

	close(fd);
	return result;
}
/*
 * Check whether @file is smaller than @min_dev_size bytes.
 *
 * Returns 1 if it is too small, 0 if it is large enough, and -errno if
 * @file cannot be opened or stat'ed.
 */
int test_minimum_size(const char *file, u64 min_dev_size)
{
	struct stat statbuf;
	int fd;
	int ret;

	fd = open(file, O_RDONLY);
	if (fd < 0)
		return -errno;

	/*
	 * Stat the descriptor rather than the path so that the size query
	 * below is made about the very file that was opened.
	 */
	if (fstat(fd, &statbuf) < 0) {
		/* Save errno now, close() may overwrite it. */
		ret = -errno;
	} else if (device_get_partition_size_fd_stat(fd, &statbuf) < min_dev_size) {
		ret = 1;
	} else {
		ret = 0;
	}

	close(fd);
	return ret;
}