btrfs-progs/kernel-shared/zoned.c

1223 lines
30 KiB
C
Raw Permalink Normal View History

// SPDX-License-Identifier: GPL-2.0
/*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License v2 as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this program; if not, write to the
* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
* Boston, MA 021110-1307, USA.
*/
#include "kerncompat.h"
#include <sys/ioctl.h>
#include <sys/stat.h>
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
#include <unistd.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include "kernel-lib/list.h"
#include "kernel-shared/volumes.h"
#include "kernel-shared/zoned.h"
#include "kernel-shared/accessors.h"
#include "kernel-shared/ctree.h"
#include "kernel-shared/extent_io.h"
#include "kernel-shared/uapi/btrfs.h"
#include "kernel-shared/uapi/btrfs_tree.h"
#include "common/utils.h"
#include "common/device-utils.h"
#include "common/extent-cache.h"
#include "common/internal.h"
#include "common/string-utils.h"
#include "common/messages.h"
#include "mkfs/common.h"
/* Maximum number of zones to report per ioctl(BLKREPORTZONE) call */
#define BTRFS_REPORT_NR_ZONES 4096
/* Invalid allocation pointer value for missing devices */
#define WP_MISSING_DEV ((u64)-1)
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)
#define DEFAULT_EMULATED_ZONE_SIZE SZ_256M
static u64 emulated_zone_size = DEFAULT_EMULATED_ZONE_SIZE;
/*
* Minimum / maximum supported zone size. Currently, SMR disks have a zone size
* of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do
* not expect the zone size to become larger than 8GiB or smaller than 4MiB in
* the near future.
*/
#define BTRFS_MAX_ZONE_SIZE (8ULL * SZ_1G)
#define BTRFS_MIN_ZONE_SIZE (SZ_4M)
static int btrfs_get_dev_zone_info(struct btrfs_device *device);
enum btrfs_zoned_model zoned_model(const char *file)
{
const char host_aware[] = "host-aware";
const char host_managed[] = "host-managed";
struct stat st;
char model[32];
int ret;
ret = stat(file, &st);
if (ret < 0) {
error("zoned: unable to stat %s", file);
return -ENOENT;
}
/* Consider a regular file as non-zoned device */
if (!S_ISBLK(st.st_mode))
return ZONED_NONE;
ret = device_get_queue_param(file, "zoned", model, sizeof(model));
if (ret <= 0)
return ZONED_NONE;
if (strncmp(model, host_aware, strlen(host_aware)) == 0)
return ZONED_HOST_AWARE;
if (strncmp(model, host_managed, strlen(host_managed)) == 0)
return ZONED_HOST_MANAGED;
return ZONED_NONE;
}
u64 zone_size(const char *file)
{
char chunk[32];
int ret;
/* Zoned emulation on regular device */
if (zoned_model(file) == ZONED_NONE) {
const char *tmp;
u64 size = DEFAULT_EMULATED_ZONE_SIZE;
tmp = bconf_param_value("zone-size");
if (tmp) {
size = arg_strtou64_with_suffix(tmp);
if (!is_power_of_2(size) || size < BTRFS_MIN_ZONE_SIZE ||
size > BTRFS_MAX_ZONE_SIZE) {
error("invalid emulated zone size %llu", size);
exit(1);
}
}
emulated_zone_size = size;
return emulated_zone_size;
}
ret = device_get_queue_param(file, "chunk_sectors", chunk, sizeof(chunk));
if (ret <= 0)
return 0;
return strtoull((const char *)chunk, NULL, 10) << SECTOR_SHIFT;
}
static u64 max_zone_append_size(const char *file)
{
char chunk[32];
int ret;
ret = device_get_queue_param(file, "zone_append_max_bytes", chunk,
sizeof(chunk));
if (ret <= 0)
return 0;
return strtoull((const char *)chunk, NULL, 10);
}
#ifdef BTRFS_ZONED
/*
* Emulate blkdev_report_zones() for a non-zoned device. It slices up the block
* device into fixed-sized chunks and emulate a conventional zone on each of
* them.
*/
static int emulate_report_zones(const char *file, int fd, u64 pos,
struct blk_zone *zones, unsigned int nr_zones)
{
const sector_t zone_sectors = emulated_zone_size >> SECTOR_SHIFT;
struct stat st;
sector_t bdev_size;
unsigned int i;
int ret;
ret = fstat(fd, &st);
if (ret < 0) {
error("unable to stat %s: %m", file);
return -EIO;
}
bdev_size = device_get_partition_size_fd_stat(fd, &st) >> SECTOR_SHIFT;
pos >>= SECTOR_SHIFT;
for (i = 0; i < nr_zones; i++) {
zones[i].start = i * zone_sectors + pos;
zones[i].len = zone_sectors;
zones[i].capacity = zone_sectors;
zones[i].wp = zones[i].start + zone_sectors;
zones[i].type = BLK_ZONE_TYPE_CONVENTIONAL;
zones[i].cond = BLK_ZONE_COND_NOT_WP;
if (zones[i].wp >= bdev_size) {
i++;
break;
}
}
return i;
}
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
static int sb_write_pointer(int fd, struct blk_zone *zones, u64 *wp_ret)
{
bool empty[BTRFS_NR_SB_LOG_ZONES];
bool full[BTRFS_NR_SB_LOG_ZONES];
sector_t sector;
ASSERT(zones[0].type != BLK_ZONE_TYPE_CONVENTIONAL &&
zones[1].type != BLK_ZONE_TYPE_CONVENTIONAL);
empty[0] = (zones[0].cond == BLK_ZONE_COND_EMPTY);
empty[1] = (zones[1].cond == BLK_ZONE_COND_EMPTY);
full[0] = (zones[0].cond == BLK_ZONE_COND_FULL);
full[1] = (zones[1].cond == BLK_ZONE_COND_FULL);
/*
* Possible states of log buffer zones
*
* Empty[0] In use[0] Full[0]
* Empty[1] * x 0
* In use[1] 0 x 0
* Full[1] 1 1 C
*
* Log position:
* *: Special case, no superblock is written
* 0: Use write pointer of zones[0]
* 1: Use write pointer of zones[1]
* C: Compare super blocks from zones[0] and zones[1], use the latest
* one determined by generation
* x: Invalid state
*/
if (empty[0] && empty[1]) {
/* Special case to distinguish no superblock to read */
*wp_ret = (zones[0].start << SECTOR_SHIFT);
return -ENOENT;
} else if (full[0] && full[1]) {
/* Compare two super blocks */
u8 buf[BTRFS_NR_SB_LOG_ZONES][BTRFS_SUPER_INFO_SIZE];
struct btrfs_super_block *super[BTRFS_NR_SB_LOG_ZONES];
int i;
int ret;
for (i = 0; i < BTRFS_NR_SB_LOG_ZONES; i++) {
u64 bytenr;
bytenr = ((zones[i].start + zones[i].len)
<< SECTOR_SHIFT) - BTRFS_SUPER_INFO_SIZE;
btrfs-progs: stop using legacy *64 interfaces The *64 interfaces, such as fstat64, off64_t, etc, are legacy interfaces created at a time when 64-bit file support was still new. They are generally exposed when defining a macro named _LARGEFILE64_SOURCE, as e.g. the glibc docs[0] say. The modern way to utilise largefile support, is to continue to use the regular interfaces (off_t, fstat, ..), and define _FILE_OFFSET_BITS=64. We already use the autoconf macro AC_SYS_LARGEFILE[1] which arranges this and sets this macro for us. Therefore, we can utilise the non-64 names without fear of breaking on 32-bit systems. This fixes the build against musl libc, ever since musl dropped the *64 compat from interfaces by default[2] just for _GNU_SOURCE, unless _LARGEFILE64_SOURCE is defined. However, there are plans for a future removal of the whole *64 header API, and that workaround (adding another define) might cease to exist. So, rename all *64 API use to the regular non-suffixed names. For consistency, rename the internal functions that were *64 named (lstat64_path, ..) too. This should have no regressions on any platform. [0]: https://www.gnu.org/software/libc/manual/html_node/Feature-Test-Macros.html#index-_005fLARGEFILE64_005fSOURCE [1]: https://www.gnu.org/software/autoconf/manual/autoconf-2.67/html_node/System-Services.html [2]: https://github.com/bminor/musl/commit/25e6fee27f4a293728dd15b659170e7b9c7db9bc Pull-request: #615 Signed-off-by: psykose <alice@ayaya.dev> Signed-off-by: David Sterba <dsterba@suse.com>
2023-04-15 19:15:49 +02:00
ret = pread(fd, buf[i], BTRFS_SUPER_INFO_SIZE, bytenr);
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
if (ret != BTRFS_SUPER_INFO_SIZE)
return -EIO;
super[i] = (struct btrfs_super_block *)&buf[i];
}
if (super[0]->generation > super[1]->generation)
sector = zones[1].start;
else
sector = zones[0].start;
} else if (!full[0] && (empty[1] || full[1])) {
sector = zones[0].wp;
} else if (full[0]) {
sector = zones[1].wp;
} else {
return -EUCLEAN;
}
*wp_ret = sector << SECTOR_SHIFT;
return 0;
}
int btrfs_reset_dev_zone(int fd, struct blk_zone *zone)
{
struct blk_zone_range range;
/* Nothing to do if it is already empty */
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
zone->cond == BLK_ZONE_COND_EMPTY)
return 0;
range.sector = zone->start;
range.nr_sectors = zone->len;
if (ioctl(fd, BLKRESETZONE, &range) < 0)
return -errno;
zone->cond = BLK_ZONE_COND_EMPTY;
zone->wp = zone->start;
return 0;
}
static int report_zones(int fd, const char *file,
struct btrfs_zoned_device_info *zinfo)
{
u64 device_size;
u64 zone_bytes = zone_size(file);
size_t rep_size;
u64 sector = 0;
struct stat st;
struct blk_zone_report *rep;
struct blk_zone *zone;
unsigned int i, n = 0;
int ret;
/*
* Zones are guaranteed (by kernel) to be a power of 2 number of
* sectors. Check this here and make sure that zones are not too small.
*/
if (!zone_bytes || !is_power_of_2(zone_bytes)) {
error("zoned: illegal zone size %llu (not a power of 2)",
zone_bytes);
exit(1);
}
/*
* The zone size must be large enough to hold the initial system
* block group for mkfs time.
*/
if (zone_bytes < BTRFS_MKFS_SYSTEM_GROUP_SIZE) {
error("zoned: illegal zone size %llu (smaller than %d)",
zone_bytes, BTRFS_MKFS_SYSTEM_GROUP_SIZE);
exit(1);
}
ret = fstat(fd, &st);
if (ret < 0) {
error("error when reading zone info on %s: %m", file);
return -EIO;
}
device_size = device_get_partition_size_fd_stat(fd, &st);
if (device_size == 0) {
error("zoned: failed to read size of %s: %m", file);
exit(1);
}
/* Allocate the zone information array */
zinfo->zone_size = zone_bytes;
zinfo->nr_zones = device_size / zone_bytes;
if (zinfo->zone_size > BTRFS_MAX_ZONE_SIZE) {
error("zoned: zone size %llu larger than supported maximum %llu",
zinfo->zone_size, BTRFS_MAX_ZONE_SIZE);
exit(1);
} else if (zinfo->zone_size < BTRFS_MIN_ZONE_SIZE) {
error("zoned: zone size %llu smaller than supported minimum %u",
zinfo->zone_size, BTRFS_MIN_ZONE_SIZE);
exit(1);
}
if (device_size & (zone_bytes - 1))
zinfo->nr_zones++;
if (zoned_model(file) != ZONED_NONE && max_zone_append_size(file) == 0) {
error(
"zoned: device %s does not support ZONE_APPEND command", file);
exit(1);
}
zinfo->zones = calloc(zinfo->nr_zones, sizeof(struct blk_zone));
if (!zinfo->zones) {
error_msg(ERROR_MSG_MEMORY, "zone information");
exit(1);
}
/* Allocate a zone report */
rep_size = sizeof(struct blk_zone_report) +
sizeof(struct blk_zone) * BTRFS_REPORT_NR_ZONES;
rep = kmalloc(rep_size, GFP_KERNEL);
if (!rep) {
error_msg(ERROR_MSG_MEMORY, "zone report");
exit(1);
}
/* Get zone information */
zone = (struct blk_zone *)(rep + 1);
while (n < zinfo->nr_zones) {
memset(rep, 0, rep_size);
rep->sector = sector;
rep->nr_zones = BTRFS_REPORT_NR_ZONES;
if (zinfo->model != ZONED_NONE) {
ret = ioctl(fd, BLKREPORTZONE, rep);
if (ret != 0) {
error("zoned: ioctl BLKREPORTZONE failed (%m)");
exit(1);
}
zinfo->emulated = false;
} else {
ret = emulate_report_zones(file, fd,
sector << SECTOR_SHIFT,
zone, BTRFS_REPORT_NR_ZONES);
if (ret < 0) {
error("zoned: failed to emulate BLKREPORTZONE");
exit(1);
}
zinfo->emulated = true;
}
if (!rep->nr_zones)
break;
for (i = 0; i < rep->nr_zones; i++) {
if (n >= zinfo->nr_zones)
break;
memcpy(&zinfo->zones[n], &zone[i],
sizeof(struct blk_zone));
n++;
}
sector = zone[rep->nr_zones - 1].start +
zone[rep->nr_zones - 1].len;
}
kfree(rep);
return 0;
}
/*
* Discard blocks in the zones of a zoned block device. Process this with zone
* size granularity so that blocks in conventional zones are discarded using
* discard_range and blocks in sequential zones are reset though a zone reset.
*
* We need to ensure that zones outside of the fs are not active, so that the fs
* can use all the active zones. Return EBUSY if there is an active zone.
*/
int btrfs_reset_zones(int fd, struct btrfs_zoned_device_info *zinfo, u64 byte_count)
{
unsigned int i;
int ret = 0;
ASSERT(zinfo);
ASSERT(IS_ALIGNED(byte_count, zinfo->zone_size));
/* Zone size granularity */
for (i = 0; i < zinfo->nr_zones; i++) {
if (byte_count == 0)
break;
if (zinfo->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL) {
ret = device_discard_blocks(fd,
zinfo->zones[i].start << SECTOR_SHIFT,
zinfo->zone_size);
if (ret == EOPNOTSUPP)
ret = 0;
} else if (zinfo->zones[i].cond != BLK_ZONE_COND_EMPTY) {
ret = btrfs_reset_dev_zone(fd, &zinfo->zones[i]);
} else {
ret = 0;
}
if (ret)
return ret;
byte_count -= zinfo->zone_size;
}
for (; i < zinfo->nr_zones; i++) {
const enum blk_zone_cond cond = zinfo->zones[i].cond;
if (zinfo->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
continue;
if (cond == BLK_ZONE_COND_IMP_OPEN ||
cond == BLK_ZONE_COND_EXP_OPEN ||
cond == BLK_ZONE_COND_CLOSED)
return EBUSY;
}
return fsync(fd);
}
int zero_zone_blocks(int fd, struct btrfs_zoned_device_info *zinfo, off_t start,
size_t len)
{
size_t zone_len = zinfo->zone_size;
off_t ofst = start;
size_t count;
int ret;
/* Make sure that device_zero_blocks does not write sequential zones */
while (len > 0) {
/* Limit device_zero_blocks to a single zone */
count = min_t(size_t, len, zone_len);
if (count > zone_len - (ofst & (zone_len - 1)))
count = zone_len - (ofst & (zone_len - 1));
if (!zone_is_sequential(zinfo, ofst)) {
ret = device_zero_blocks(fd, ofst, count, true);
if (ret != 0)
return ret;
}
len -= count;
ofst += count;
}
return 0;
}
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
static int sb_log_location(int fd, struct blk_zone *zones, int rw, u64 *bytenr_ret)
{
u64 wp;
int ret;
/* Use the head of the zones if either zone is conventional */
if (zones[0].type == BLK_ZONE_TYPE_CONVENTIONAL) {
*bytenr_ret = zones[0].start << SECTOR_SHIFT;
return 0;
} else if (zones[1].type == BLK_ZONE_TYPE_CONVENTIONAL) {
*bytenr_ret = zones[1].start << SECTOR_SHIFT;
return 0;
}
ret = sb_write_pointer(fd, zones, &wp);
if (ret != -ENOENT && ret < 0)
return ret;
if (rw == WRITE) {
struct blk_zone *reset = NULL;
if (wp == zones[0].start << SECTOR_SHIFT)
reset = &zones[0];
else if (wp == zones[1].start << SECTOR_SHIFT)
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
ASSERT(reset->cond == BLK_ZONE_COND_FULL);
ret = btrfs_reset_dev_zone(fd, reset);
if (ret)
return ret;
}
} else if (ret != -ENOENT) {
/* For READ, we want the previous one */
if (wp == zones[0].start << SECTOR_SHIFT)
wp = (zones[1].start + zones[1].len) << SECTOR_SHIFT;
wp -= BTRFS_SUPER_INFO_SIZE;
}
*bytenr_ret = wp;
return 0;
}
static u32 sb_bytenr_to_sb_zone(u64 bytenr, int zone_size_shift)
{
int mirror = -1;
for (int i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
if (bytenr == btrfs_sb_offset(i)) {
mirror = i;
break;
}
}
ASSERT(mirror != -1);
return sb_zone_number(zone_size_shift, mirror);
}
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
size_t btrfs_sb_io(int fd, void *buf, off_t offset, int rw)
{
size_t count = BTRFS_SUPER_INFO_SIZE;
struct stat stat_buf;
struct blk_zone_report *rep;
struct blk_zone *zones;
const u64 sb_size_sector = (BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT);
u64 mapped = U64_MAX;
u32 zone_num;
u32 zone_size_sector;
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
size_t rep_size;
int ret;
size_t ret_sz;
ASSERT(rw == READ || rw == WRITE);
if (fstat(fd, &stat_buf) == -1) {
error("fstat failed: %m");
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
exit(1);
}
/* Do not call ioctl(BLKGETZONESZ) on a regular file. */
if ((stat_buf.st_mode & S_IFMT) == S_IFBLK) {
ret = ioctl(fd, BLKGETZONESZ, &zone_size_sector);
if (ret < 0) {
if (errno == ENOTTY || errno == EINVAL) {
/*
* No kernel support, assuming non-zoned device.
*
* Note: older kernels before 5.11 could return
* EINVAL in case the ioctl is not available,
* which is wrong.
*/
zone_size_sector = 0;
} else {
error("zoned: ioctl BLKGETZONESZ failed: %m");
exit(1);
}
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
}
} else {
zone_size_sector = 0;
}
/* We can call pread/pwrite if 'fd' is non-zoned device/file */
if (zone_size_sector == 0) {
if (rw == READ)
btrfs-progs: stop using legacy *64 interfaces The *64 interfaces, such as fstat64, off64_t, etc, are legacy interfaces created at a time when 64-bit file support was still new. They are generally exposed when defining a macro named _LARGEFILE64_SOURCE, as e.g. the glibc docs[0] say. The modern way to utilise largefile support, is to continue to use the regular interfaces (off_t, fstat, ..), and define _FILE_OFFSET_BITS=64. We already use the autoconf macro AC_SYS_LARGEFILE[1] which arranges this and sets this macro for us. Therefore, we can utilise the non-64 names without fear of breaking on 32-bit systems. This fixes the build against musl libc, ever since musl dropped the *64 compat from interfaces by default[2] just for _GNU_SOURCE, unless _LARGEFILE64_SOURCE is defined. However, there are plans for a future removal of the whole *64 header API, and that workaround (adding another define) might cease to exist. So, rename all *64 API use to the regular non-suffixed names. For consistency, rename the internal functions that were *64 named (lstat64_path, ..) too. This should have no regressions on any platform. [0]: https://www.gnu.org/software/libc/manual/html_node/Feature-Test-Macros.html#index-_005fLARGEFILE64_005fSOURCE [1]: https://www.gnu.org/software/autoconf/manual/autoconf-2.67/html_node/System-Services.html [2]: https://github.com/bminor/musl/commit/25e6fee27f4a293728dd15b659170e7b9c7db9bc Pull-request: #615 Signed-off-by: psykose <alice@ayaya.dev> Signed-off-by: David Sterba <dsterba@suse.com>
2023-04-15 19:15:49 +02:00
return pread(fd, buf, count, offset);
return pwrite(fd, buf, count, offset);
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
}
ASSERT(IS_ALIGNED(zone_size_sector, sb_size_sector));
zone_num = sb_bytenr_to_sb_zone(offset, ilog2(zone_size_sector) + SECTOR_SHIFT);
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
rep_size = sizeof(struct blk_zone_report) + sizeof(struct blk_zone) * 2;
rep = calloc(1, rep_size);
if (!rep) {
error_msg(ERROR_MSG_MEMORY, "zone report");
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
exit(1);
}
rep->sector = zone_num * (sector_t)zone_size_sector;
rep->nr_zones = 2;
ret = ioctl(fd, BLKREPORTZONE, rep);
if (ret) {
if (errno == ENOTTY || errno == EINVAL) {
/*
* Note: older kernels before 5.11 could return EINVAL
* in case the ioctl is not available, which is wrong.
*/
error("zoned: BLKREPORTZONE failed but BLKGETZONESZ works: %m");
exit(1);
}
error("zoned: ioctl BLKREPORTZONE failed: %m");
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
exit(1);
}
if (rep->nr_zones != 2) {
if (errno == ENOENT || errno == 0)
return (rw == WRITE ? count : 0);
error("zoned: failed to read zone info of %u and %u: %m",
zone_num, zone_num + 1);
kfree(rep);
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
return 0;
}
zones = (struct blk_zone *)(rep + 1);
ret = sb_log_location(fd, zones, rw, &mapped);
kfree(rep);
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
/*
* Special case: no superblock found in the zones. This case happens
* when initializing a file-system.
*/
if (rw == READ && ret == -ENOENT) {
memset(buf, 0, count);
return count;
}
if (ret)
return ret;
if (rw == READ)
ret_sz = btrfs_pread(fd, buf, count, mapped, true);
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
else
ret_sz = btrfs_pwrite(fd, buf, count, mapped, true);
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
if (ret_sz != count)
return ret_sz;
/* Call fsync() to force the write order */
if (rw == WRITE && fsync(fd)) {
error("failed to synchronize superblock: %m");
btrfs-progs: zoned: implement log-structured superblock Superblock (and its copies) is the only data structure in btrfs which has a fixed location on a device. Since we cannot overwrite in a sequential write required zone, we cannot place superblock in the zone. One easy solution is limiting superblock and copies to be placed only in conventional zones. However, this method has two downsides: one is reduced number of superblock copies. The location of the second copy of superblock is 256GB, which is in a sequential write required zone on typical devices in the market today. So, the number of superblock and copies is limited to be two. Second downside is that we cannot support devices which have no conventional zones at all. To solve these two problems, we employ superblock log writing. It uses two adjacent zones as a circular buffer to write updated superblocks. Once the first zone is filled up, start writing into the second one. Then, when both zones are filled up and before starting to write to the first zone again, reset the first zone. We can determine the position of the latest superblock by reading write pointer information from a device. One corner case is when both zones are full. For this situation, we read out the last superblock of each zone, and compare them to determine which zone is older. The following zones are reserved as the circular buffer on ZONED btrfs. - primary superblock: offset 0B (and the following zone) - first copy: offset 512G (and the following zone) - Second copy: offset 4T (4096G, and the following zone) If these reserved zones are conventional, superblock is written fixed at the start of the zone without logging. Currently, superblock reading/writing is done by pread/pwrite. This commit replace the call sites with sbread/sbwrite to wrap the functions. For zoned btrfs, btrfs_sb_io which is called from sbread/sbwrite reverses the IO position back to a mirror number, maps the mirror number into the superblock logging position, and do the IO. Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com> Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com> Signed-off-by: David Sterba <dsterba@suse.com>
2021-04-26 08:27:26 +02:00
exit(1);
}
return ret_sz;
}
/**
* btrfs_find_allocatable_zones - find allocatable zones within a given region
*
* @device: the device to allocate a region on
* @hole_start: the position of the hole to allocate the region
* @num_bytes: size of wanted region
* @hole_end: the end of the hole
* @return: position of allocatable zones
*
* Allocatable region should not contain any superblock locations.
*/
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
u64 hole_end, u64 num_bytes)
{
struct btrfs_zoned_device_info *zinfo = device->zone_info;
int shift = ilog2(zinfo->zone_size);
u64 nzones = num_bytes >> shift;
u64 pos = hole_start;
u64 begin, end;
bool is_sequential;
bool have_sb;
int i;
ASSERT(IS_ALIGNED(hole_start, zinfo->zone_size));
ASSERT(IS_ALIGNED(num_bytes, zinfo->zone_size));
while (pos < hole_end) {
begin = pos >> shift;
end = begin + nzones;
if (end > zinfo->nr_zones)
return hole_end;
/*
* The zones must be all sequential (and empty), or
* conventional
*/
is_sequential = btrfs_dev_is_sequential(device, pos);
for (i = 0; i < end - begin; i++) {
u64 zone_offset = pos + ((u64)i << shift);
if ((is_sequential &&
!btrfs_dev_is_empty_zone(device, zone_offset)) ||
(is_sequential !=
btrfs_dev_is_sequential(device, zone_offset))) {
pos += zinfo->zone_size;
continue;
}
}
have_sb = false;
for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
u32 sb_zone;
u64 sb_pos;
sb_zone = sb_zone_number(shift, i);
if (!(end <= sb_zone ||
sb_zone + BTRFS_NR_SB_LOG_ZONES <= begin)) {
have_sb = true;
pos = ((u64)sb_zone + BTRFS_NR_SB_LOG_ZONES) << shift;
break;
}
/* We also need to exclude regular superblock positions */
sb_pos = btrfs_sb_offset(i);
if (!(pos + num_bytes <= sb_pos ||
sb_pos + BTRFS_SUPER_INFO_SIZE <= pos)) {
have_sb = true;
pos = ALIGN(sb_pos + BTRFS_SUPER_INFO_SIZE,
zinfo->zone_size);
break;
}
}
if (!have_sb)
break;
}
return pos;
}
/*
* Calculate an allocation pointer from the extent allocation information
* for a block group consisting of conventional zones. It is pointed to the
* end of the highest addressed extent in the block group as an allocation
* offset.
*/
static int calculate_alloc_pointer(struct btrfs_fs_info *fs_info,
struct btrfs_block_group *cache,
u64 *offset_ret)
{
struct btrfs_root *root = btrfs_extent_root(fs_info, cache->start);
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
int ret;
u64 length;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = cache->start + cache->length;
key.type = 0;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
/* There should be no exact match (ie. an extent) at this address */
if (!ret)
ret = -EUCLEAN;
if (ret < 0)
goto out;
ret = btrfs_previous_extent_item(root, path, cache->start);
if (ret) {
if (ret == 1) {
ret = 0;
*offset_ret = 0;
}
goto out;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
length = found_key.offset;
else
length = fs_info->nodesize;
if (!(found_key.objectid >= cache->start &&
found_key.objectid + length <= cache->start + cache->length)) {
ret = -EUCLEAN;
goto out;
}
*offset_ret = found_key.objectid + length - cache->start;
ret = 0;
out:
btrfs_free_path(path);
return ret;
}
bool zoned_profile_supported(u64 map_type, bool rst)
{
bool data = (map_type & BTRFS_BLOCK_GROUP_DATA);
u64 flags = (map_type & BTRFS_BLOCK_GROUP_PROFILE_MASK);
/* SINGLE */
if (flags == 0)
return true;
if (data) {
if ((flags & BTRFS_BLOCK_GROUP_DUP) && rst)
return true;
/* Data RAID1 needs a raid-stripe-tree. */
if ((flags & BTRFS_BLOCK_GROUP_RAID1_MASK) && rst)
return true;
/* Data RAID0 needs a raid-stripe-tree. */
if ((flags & BTRFS_BLOCK_GROUP_RAID0) && rst)
return true;
/* Data RAID10 needs a raid-stripe-tree. */
if ((flags & BTRFS_BLOCK_GROUP_RAID10) && rst)
return true;
} else {
/* We can support DUP on metadata/system. */
if (flags & BTRFS_BLOCK_GROUP_DUP)
return true;
/* We can support RAID1 on metadata/system. */
if (flags & BTRFS_BLOCK_GROUP_RAID1_MASK)
return true;
/* We can support RAID0 on metadata/system. */
if (flags & BTRFS_BLOCK_GROUP_RAID0)
return true;
/* We can support RAID10 on metadata/system. */
if (flags & BTRFS_BLOCK_GROUP_RAID10)
return true;
}
/* All other profiles are not supported yet */
return false;
}
int btrfs_load_block_group_zone_info(struct btrfs_fs_info *fs_info,
struct btrfs_block_group *cache)
{
struct btrfs_device *device;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
struct cache_extent *ce;
struct map_lookup *map;
u64 logical = cache->start;
u64 length = cache->length;
u64 physical = 0;
int ret = 0;
int i;
u64 *alloc_offsets = NULL;
u64 last_alloc = 0;
u32 num_conventional = 0;
if (!btrfs_is_zoned(fs_info))
return 0;
/* Sanity check */
if (logical == BTRFS_BLOCK_RESERVED_1M_FOR_SUPER) {
if (length + SZ_1M != fs_info->zone_size) {
error("zoned: unaligned initial system block group");
return -EIO;
}
} else if (!IS_ALIGNED(length, fs_info->zone_size)) {
error("zoned: unaligned block group at %llu + %llu", logical,
length);
return -EIO;
}
/* Get the chunk mapping */
ce = search_cache_extent(&map_tree->cache_tree, logical);
if (!ce) {
error("zoned: failed to find block group at %llu", logical);
return -ENOENT;
}
map = container_of(ce, struct map_lookup, ce);
alloc_offsets = calloc(map->num_stripes, sizeof(*alloc_offsets));
if (!alloc_offsets) {
error_msg(ERROR_MSG_MEMORY, "zone offsets");
return -ENOMEM;
}
for (i = 0; i < map->num_stripes; i++) {
bool is_sequential;
struct blk_zone zone;
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
if (device->fd == -1) {
alloc_offsets[i] = WP_MISSING_DEV;
continue;
}
is_sequential = btrfs_dev_is_sequential(device, physical);
if (!is_sequential)
num_conventional++;
if (!is_sequential) {
alloc_offsets[i] = WP_CONVENTIONAL;
continue;
}
/*
* The group is mapped to a sequential zone. Get the zone write
* pointer to determine the allocation offset within the zone.
*/
WARN_ON(!IS_ALIGNED(physical, fs_info->zone_size));
zone = device->zone_info->zones[physical / fs_info->zone_size];
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
error(
"zoned: offline/readonly zone %llu on device %s (devid %llu)",
physical / fs_info->zone_size, device->name,
device->devid);
alloc_offsets[i] = WP_MISSING_DEV;
break;
case BLK_ZONE_COND_EMPTY:
alloc_offsets[i] = 0;
break;
case BLK_ZONE_COND_FULL:
alloc_offsets[i] = fs_info->zone_size;
break;
default:
/* Partially used zone */
alloc_offsets[i] =
((zone.wp - zone.start) << SECTOR_SHIFT);
break;
}
}
if (num_conventional > 0) {
ret = calculate_alloc_pointer(fs_info, cache, &last_alloc);
if (ret || map->num_stripes == num_conventional) {
if (!ret)
cache->alloc_offset = last_alloc;
else
error(
"zoned: failed to determine allocation offset of block group %llu",
cache->start);
goto out;
}
}
if (!zoned_profile_supported(map->type, !!fs_info->stripe_root)) {
error("zoned: profile %s not yet supported",
btrfs_group_profile_str(map->type));
ret = -EINVAL;
goto out;
}
cache->alloc_offset = alloc_offsets[0];
out:
/* An extent is allocated after the write pointer */
if (!ret && num_conventional && last_alloc > cache->alloc_offset) {
error(
"zoned: got wrong write pointer in block group %llu: %llu > %llu",
logical, last_alloc, cache->alloc_offset);
ret = -EIO;
}
if (!ret)
cache->write_offset = cache->alloc_offset;
kfree(alloc_offsets);
return ret;
}
bool btrfs_redirty_extent_buffer_for_zoned(struct btrfs_fs_info *fs_info,
u64 start, u64 end)
{
u64 next;
struct btrfs_block_group *cache;
struct extent_buffer *eb;
if (!btrfs_is_zoned(fs_info))
return false;
cache = btrfs_lookup_first_block_group(fs_info, start);
BUG_ON(!cache);
if (cache->start + cache->write_offset < start) {
next = cache->start + cache->write_offset;
BUG_ON(next + fs_info->nodesize > start);
eb = btrfs_find_create_tree_block(fs_info, next);
btrfs_mark_buffer_dirty(eb);
free_extent_buffer(eb);
return true;
}
cache->write_offset += (end + 1 - start);
return false;
}
int btrfs_reset_chunk_zones(struct btrfs_fs_info *fs_info, u64 devid,
u64 offset, u64 length)
{
struct btrfs_device *device;
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
struct btrfs_zoned_device_info *zinfo;
struct blk_zone *reset;
if (device->devid != devid)
continue;
zinfo = device->zone_info;
if (!zone_is_sequential(zinfo, offset))
continue;
reset = &zinfo->zones[offset / zinfo->zone_size];
if (btrfs_reset_dev_zone(device->fd, reset)) {
error("zoned: failed to reset zone %llu: %m",
offset / zinfo->zone_size);
return -EIO;
}
}
return 0;
}
int btrfs_wipe_temporary_sb(struct btrfs_fs_devices *fs_devices)
{
struct list_head *head = &fs_devices->devices;
struct btrfs_device *dev;
int ret = 0;
list_for_each_entry(dev, head, dev_list) {
struct btrfs_zoned_device_info *zinfo = dev->zone_info;
if (!zinfo)
continue;
ret = btrfs_reset_dev_zone(dev->fd, &zinfo->zones[0]);
if (ret)
break;
}
return ret;
}
bool btrfs_sb_zone_exists(struct btrfs_device *device, u64 bytenr)
{
u32 zone_num = sb_bytenr_to_sb_zone(bytenr,
ilog2(device->zone_info->zone_size));
return zone_num + 1 <= device->zone_info->nr_zones - 1;
}
#endif
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
int ret = 0;
/* fs_info->zone_size might not set yet. Use the incomapt flag here. */
if (!btrfs_fs_incompat(fs_info, ZONED))
return 0;
list_for_each_entry(device, &fs_devices->devices, dev_list) {
/* We can skip reading of zone info for missing devices */
if (device->fd == -1)
continue;
ret = btrfs_get_dev_zone_info(device);
if (ret)
break;
}
return ret;
}
static int btrfs_get_dev_zone_info(struct btrfs_device *device)
{
struct btrfs_fs_info *fs_info = device->fs_info;
/*
* Cannot use btrfs_is_zoned here, since fs_info::zone_size might not
* yet be set.
*/
if (!btrfs_fs_incompat(fs_info, ZONED))
return 0;
if (device->zone_info)
return 0;
return btrfs_get_zone_info(device->fd, device->name, &device->zone_info);
}
int btrfs_get_zone_info(int fd, const char *file,
struct btrfs_zoned_device_info **zinfo_ret)
{
#ifdef BTRFS_ZONED
struct btrfs_zoned_device_info *zinfo;
int ret;
#endif
enum btrfs_zoned_model model;
*zinfo_ret = NULL;
/* Check zone model */
model = zoned_model(file);
#ifdef BTRFS_ZONED
zinfo = calloc(1, sizeof(*zinfo));
if (!zinfo) {
error_msg(ERROR_MSG_MEMORY, "zone information");
exit(1);
}
zinfo->model = model;
/* Get zone information */
ret = report_zones(fd, file, zinfo);
if (ret != 0) {
kfree(zinfo);
return ret;
}
*zinfo_ret = zinfo;
#else
error("zoned: %s: unsupported host-%s zoned block device", file,
model == ZONED_HOST_MANAGED ? "managed" : "aware");
if (model == ZONED_HOST_MANAGED)
return -EOPNOTSUPP;
error("zoned: %s: handling host-aware block device as a regular disk",
file);
#endif
return 0;
}
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 zoned_devices = 0;
u64 nr_devices = 0;
u64 zone_size = 0;
const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED);
int ret = 0;
/* Count zoned devices */
list_for_each_entry(device, &fs_devices->devices, dev_list) {
enum btrfs_zoned_model model;
if (device->fd == -1)
continue;
model = zoned_model(device->name);
/*
* A Host-Managed zoned device must be used as a zoned device.
* A Host-Aware zoned device and a non-zoned devices can be
* treated as a zoned device, if ZONED flag is enabled in the
* superblock.
*/
if (model == ZONED_HOST_MANAGED ||
(model == ZONED_HOST_AWARE && incompat_zoned) ||
(model == ZONED_NONE && incompat_zoned)) {
struct btrfs_zoned_device_info *zone_info =
device->zone_info;
zoned_devices++;
if (!zone_size) {
zone_size = zone_info->zone_size;
} else if (zone_info->zone_size != zone_size) {
error(
"zoned: unequal block device zone sizes: have %llu found %llu",
device->zone_info->zone_size,
zone_size);
ret = -EINVAL;
goto out;
}
}
nr_devices++;
}
if (!zoned_devices && !incompat_zoned)
goto out;
if (!zoned_devices && incompat_zoned) {
/* No zoned block device found on ZONED filesystem */
error("zoned: no zoned devices found on a zoned filesystem");
ret = -EINVAL;
goto out;
}
if (zoned_devices && !incompat_zoned) {
error("zoned: mode not enabled but zoned device found");
ret = -EINVAL;
goto out;
}
if (zoned_devices != nr_devices) {
error("zoned: cannot mix zoned and regular devices");
ret = -EINVAL;
goto out;
}
/*
* stripe_size is always aligned to BTRFS_STRIPE_LEN in
* __btrfs_alloc_chunk(). Since we want stripe_len == zone_size,
* check the alignment here.
*/
if (!IS_ALIGNED(zone_size, BTRFS_STRIPE_LEN)) {
error("zoned: zone size %llu not aligned to stripe %u",
zone_size, BTRFS_STRIPE_LEN);
ret = -EINVAL;
goto out;
}
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
error("zoned: mixed block groups not supported");
ret = -EINVAL;
goto out;
}
fs_info->zone_size = zone_size;
fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED;
out:
return ret;
}