[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
[Qemu-devel] [PULL 06/69] sheepdog: selectable object size support
From: |
Stefan Hajnoczi |
Subject: |
[Qemu-devel] [PULL 06/69] sheepdog: selectable object size support |
Date: |
Fri, 27 Feb 2015 18:18:04 +0000 |
From: Teruaki Ishizaki <address@hidden>
Previously, qemu block driver of sheepdog used hard-coded VDI object size.
This patch enables users to handle VDI object size.
When you start qemu, you don't need to specify additional command option.
But when you create the VDI which doesn't have default object size
with qemu-img command, you specify object_size option.
If you want to create a VDI of 8MB object size,
you need to specify following command option.
# qemu-img create -o object_size=8M sheepdog:test1 100M
In addition, when you don't specify qemu-img command option,
a default value of sheepdog cluster is used for creating VDI.
# qemu-img create sheepdog:test2 100M
Signed-off-by: Teruaki Ishizaki <address@hidden>
Acked-by: Hitoshi Mitake <address@hidden>
Signed-off-by: Kevin Wolf <address@hidden>
---
block/sheepdog.c | 155 +++++++++++++++++++++++++++++++++++++++-------
include/block/block_int.h | 1 +
2 files changed, 134 insertions(+), 22 deletions(-)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index d17ee36..a2679c2 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -37,6 +37,7 @@
#define SD_OP_READ_VDIS 0x15
#define SD_OP_FLUSH_VDI 0x16
#define SD_OP_DEL_VDI 0x17
+#define SD_OP_GET_CLUSTER_DEFAULT 0x18
#define SD_FLAG_CMD_WRITE 0x01
#define SD_FLAG_CMD_COW 0x02
@@ -91,6 +92,7 @@
#define SD_NR_VDIS (1U << 24)
#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22)
#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS)
+#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22
/*
* For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and
* (SD_EC_MAX_STRIP - 1) for parity strips
@@ -167,7 +169,8 @@ typedef struct SheepdogVdiReq {
uint32_t base_vdi_id;
uint8_t copies;
uint8_t copy_policy;
- uint8_t reserved[2];
+ uint8_t store_policy;
+ uint8_t block_size_shift;
uint32_t snapid;
uint32_t type;
uint32_t pad[2];
@@ -186,6 +189,21 @@ typedef struct SheepdogVdiRsp {
uint32_t pad[5];
} SheepdogVdiRsp;
+typedef struct SheepdogClusterRsp {
+ uint8_t proto_ver;
+ uint8_t opcode;
+ uint16_t flags;
+ uint32_t epoch;
+ uint32_t id;
+ uint32_t data_length;
+ uint32_t result;
+ uint8_t nr_copies;
+ uint8_t copy_policy;
+ uint8_t block_size_shift;
+ uint8_t __pad1;
+ uint32_t __pad2[6];
+} SheepdogClusterRsp;
+
typedef struct SheepdogInode {
char name[SD_MAX_VDI_LEN];
char tag[SD_MAX_VDI_TAG_LEN];
@@ -1541,6 +1559,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t
*vdi_id, int snapshot,
hdr.vdi_size = s->inode.vdi_size;
hdr.copy_policy = s->inode.copy_policy;
hdr.copies = s->inode.nr_copies;
+ hdr.block_size_shift = s->inode.block_size_shift;
ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen);
@@ -1566,9 +1585,12 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t
*vdi_id, int snapshot,
static int sd_prealloc(const char *filename, Error **errp)
{
BlockDriverState *bs = NULL;
+ BDRVSheepdogState *base = NULL;
+ unsigned long buf_size;
uint32_t idx, max_idx;
+ uint32_t object_size;
int64_t vdi_size;
- void *buf = g_malloc0(SD_DATA_OBJ_SIZE);
+ void *buf = NULL;
int ret;
ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL,
@@ -1582,18 +1604,24 @@ static int sd_prealloc(const char *filename, Error
**errp)
ret = vdi_size;
goto out;
}
- max_idx = DIV_ROUND_UP(vdi_size, SD_DATA_OBJ_SIZE);
+
+ base = bs->opaque;
+ object_size = (UINT32_C(1) << base->inode.block_size_shift);
+ buf_size = MIN(object_size, SD_DATA_OBJ_SIZE);
+ buf = g_malloc0(buf_size);
+
+ max_idx = DIV_ROUND_UP(vdi_size, buf_size);
for (idx = 0; idx < max_idx; idx++) {
/*
* The created image can be a cloned image, so we need to read
* a data from the source image.
*/
- ret = bdrv_pread(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+ ret = bdrv_pread(bs, idx * buf_size, buf, buf_size);
if (ret < 0) {
goto out;
}
- ret = bdrv_pwrite(bs, idx * SD_DATA_OBJ_SIZE, buf, SD_DATA_OBJ_SIZE);
+ ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size);
if (ret < 0) {
goto out;
}
@@ -1666,6 +1694,27 @@ static int parse_redundancy(BDRVSheepdogState *s, const
char *opt)
return 0;
}
+static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt)
+{
+ struct SheepdogInode *inode = &s->inode;
+ uint64_t object_size;
+ int obj_order;
+
+ object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0);
+ if (object_size) {
+ if ((object_size - 1) & object_size) { /* not a power of 2? */
+ return -EINVAL;
+ }
+ obj_order = ffs(object_size) - 1;
+ if (obj_order < 20 || obj_order > 31) {
+ return -EINVAL;
+ }
+ inode->block_size_shift = (uint8_t)obj_order;
+ }
+
+ return 0;
+}
+
static int sd_create(const char *filename, QemuOpts *opts,
Error **errp)
{
@@ -1676,6 +1725,7 @@ static int sd_create(const char *filename, QemuOpts *opts,
BDRVSheepdogState *s;
char tag[SD_MAX_VDI_TAG_LEN];
uint32_t snapid;
+ uint64_t max_vdi_size;
bool prealloc = false;
s = g_new0(BDRVSheepdogState, 1);
@@ -1714,10 +1764,11 @@ static int sd_create(const char *filename, QemuOpts
*opts,
goto out;
}
}
-
- if (s->inode.vdi_size > SD_MAX_VDI_SIZE) {
- error_setg(errp, "too big image size");
- ret = -EINVAL;
+ ret = parse_block_size_shift(s, opts);
+ if (ret < 0) {
+ error_setg(errp, "Invalid object_size."
+ " obect_size needs to be power of 2"
+ " and be limited from 2^20 to 2^31");
goto out;
}
@@ -1754,6 +1805,51 @@ static int sd_create(const char *filename, QemuOpts
*opts,
}
s->aio_context = qemu_get_aio_context();
+
+ /* if block_size_shift is not specified, get cluster default value */
+ if (s->inode.block_size_shift == 0) {
+ SheepdogVdiReq hdr;
+ SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr;
+ Error *local_err = NULL;
+ int fd;
+ unsigned int wlen = 0, rlen = 0;
+
+ fd = connect_to_sdog(s, &local_err);
+ if (fd < 0) {
+ error_report("%s", error_get_pretty(local_err));
+ error_free(local_err);
+ ret = -EIO;
+ goto out;
+ }
+
+ memset(&hdr, 0, sizeof(hdr));
+ hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT;
+ hdr.proto_ver = SD_PROTO_VER;
+
+ ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr,
+ NULL, &wlen, &rlen);
+ closesocket(fd);
+ if (ret) {
+ error_setg_errno(errp, -ret, "failed to get cluster default");
+ goto out;
+ }
+ if (rsp->result == SD_RES_SUCCESS) {
+ s->inode.block_size_shift = rsp->block_size_shift;
+ } else {
+ s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT;
+ }
+ }
+
+ max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
+
+ if (s->inode.vdi_size > max_vdi_size) {
+ error_setg(errp, "An image is too large."
+ " The maximum image size is %"PRIu64 "GB",
+ max_vdi_size / 1024 / 1024 / 1024);
+ ret = -EINVAL;
+ goto out;
+ }
+
ret = do_sd_create(s, &vid, 0, errp);
if (ret) {
goto out;
@@ -1823,11 +1919,13 @@ static int sd_truncate(BlockDriverState *bs, int64_t
offset)
BDRVSheepdogState *s = bs->opaque;
int ret, fd;
unsigned int datalen;
+ uint64_t max_vdi_size;
+ max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS;
if (offset < s->inode.vdi_size) {
error_report("shrinking is not supported");
return -EINVAL;
- } else if (offset > SD_MAX_VDI_SIZE) {
+ } else if (offset > max_vdi_size) {
error_report("too big image size");
return -EINVAL;
}
@@ -2005,9 +2103,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
SheepdogAIOCB *acb = p;
int ret = 0;
unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE;
- unsigned long idx = acb->sector_num * BDRV_SECTOR_SIZE / SD_DATA_OBJ_SIZE;
+ unsigned long idx;
+ uint32_t object_size;
uint64_t oid;
- uint64_t offset = (acb->sector_num * BDRV_SECTOR_SIZE) % SD_DATA_OBJ_SIZE;
+ uint64_t offset;
BDRVSheepdogState *s = acb->common.bs->opaque;
SheepdogInode *inode = &s->inode;
AIOReq *aio_req;
@@ -2024,6 +2123,10 @@ static int coroutine_fn sd_co_rw_vector(void *p)
}
}
+ object_size = (UINT32_C(1) << inode->block_size_shift);
+ idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size;
+ offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size;
+
/*
* Make sure we don't free the aiocb before we are done with all requests.
* This additional reference is dropped at the end of this function.
@@ -2037,7 +2140,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
oid = vid_to_data_oid(inode->data_vdi_id[idx], idx);
- len = MIN(total - done, SD_DATA_OBJ_SIZE - offset);
+ len = MIN(total - done, object_size - offset);
switch (acb->aiocb_type) {
case AIOCB_READ_UDATA:
@@ -2061,7 +2164,7 @@ static int coroutine_fn sd_co_rw_vector(void *p)
* We discard the object only when the whole object is
* 1) allocated 2) trimmed. Otherwise, simply skip it.
*/
- if (len != SD_DATA_OBJ_SIZE || inode->data_vdi_id[idx] == 0) {
+ if (len != object_size || inode->data_vdi_id[idx] == 0) {
goto done;
}
break;
@@ -2414,6 +2517,7 @@ static int do_load_save_vmstate(BDRVSheepdogState *s,
uint8_t *data,
uint64_t offset;
uint32_t vdi_index;
uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id;
+ uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift);
fd = connect_to_sdog(s, &local_err);
if (fd < 0) {
@@ -2422,10 +2526,10 @@ static int do_load_save_vmstate(BDRVSheepdogState *s,
uint8_t *data,
}
while (remaining) {
- vdi_index = pos / SD_DATA_OBJ_SIZE;
- offset = pos % SD_DATA_OBJ_SIZE;
+ vdi_index = pos / object_size;
+ offset = pos % object_size;
- data_len = MIN(remaining, SD_DATA_OBJ_SIZE - offset);
+ data_len = MIN(remaining, object_size - offset);
vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index);
@@ -2512,10 +2616,11 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t
sector_num, int nb_sectors,
{
BDRVSheepdogState *s = bs->opaque;
SheepdogInode *inode = &s->inode;
+ uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
uint64_t offset = sector_num * BDRV_SECTOR_SIZE;
- unsigned long start = offset / SD_DATA_OBJ_SIZE,
+ unsigned long start = offset / object_size,
end = DIV_ROUND_UP((sector_num + nb_sectors) *
- BDRV_SECTOR_SIZE, SD_DATA_OBJ_SIZE);
+ BDRV_SECTOR_SIZE, object_size);
unsigned long idx;
int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset;
@@ -2534,7 +2639,7 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t
sector_num, int nb_sectors,
}
}
- *pnum = (idx - start) * SD_DATA_OBJ_SIZE / BDRV_SECTOR_SIZE;
+ *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE;
if (*pnum > nb_sectors) {
*pnum = nb_sectors;
}
@@ -2545,14 +2650,15 @@ static int64_t
sd_get_allocated_file_size(BlockDriverState *bs)
{
BDRVSheepdogState *s = bs->opaque;
SheepdogInode *inode = &s->inode;
- unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, SD_DATA_OBJ_SIZE);
+ uint32_t object_size = (UINT32_C(1) << inode->block_size_shift);
+ unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size);
uint64_t size = 0;
for (i = 0; i < last; i++) {
if (inode->data_vdi_id[i] == 0) {
continue;
}
- size += SD_DATA_OBJ_SIZE;
+ size += object_size;
}
return size;
}
@@ -2581,6 +2687,11 @@ static QemuOptsList sd_create_opts = {
.type = QEMU_OPT_STRING,
.help = "Redundancy of the image"
},
+ {
+ .name = BLOCK_OPT_OBJECT_SIZE,
+ .type = QEMU_OPT_SIZE,
+ .help = "Object size of the image"
+ },
{ /* end of list */ }
}
};
diff --git a/include/block/block_int.h b/include/block/block_int.h
index b340e7e..9f7b5bd 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -56,6 +56,7 @@
#define BLOCK_OPT_ADAPTER_TYPE "adapter_type"
#define BLOCK_OPT_REDUNDANCY "redundancy"
#define BLOCK_OPT_NOCOW "nocow"
+#define BLOCK_OPT_OBJECT_SIZE "object_size"
#define BLOCK_PROBE_BUF_SIZE 512
--
2.1.0
- [Qemu-devel] [PULL 02/69] coroutine: Clean up qemu_coroutine_enter(), (continued)
- [Qemu-devel] [PULL 02/69] coroutine: Clean up qemu_coroutine_enter(), Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 04/69] vpc: Fix size in fixed image creation, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 07/69] block/raw-posix: fix compilation warning on OSX, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 05/69] vpc: Implement bdrv_co_get_block_status(), Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 08/69] qcow2: Remove unused struct QCowCreateState, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 09/69] virtio-blk: Check return value of blk_aio_ioctl, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 11/69] tests: Prepare virtio-blk-test for multi-arch implementation, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 10/69] libqos: Change use of pointers to uint64_t in virtio, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 12/69] libqos: Remove PCI assumptions in constants of virtio driver, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 13/69] libqos: Add malloc generic, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 06/69] sheepdog: selectable object size support,
Stefan Hajnoczi <=
- [Qemu-devel] [PULL 16/69] qcow2: Add refcount_bits to format-specific info, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 14/69] libqos: Add virtio MMIO support, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 15/69] qcow2: Add two new fields to BDRVQcowState, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 18/69] qcow2: Only return status from qcow2_get_refcount, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 20/69] qcow2: Use 64 bits for refcount values, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 17/69] qcow2: Do not return new value after refcount update, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 19/69] qcow2: Use unsigned addend for update_refcount(), Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 21/69] qcow2: Helper for refcount array reallocation, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 22/69] qcow2: Helper function for refcount modification, Stefan Hajnoczi, 2015/02/27
- [Qemu-devel] [PULL 24/69] qcow2: Open images with refcount order != 4, Stefan Hajnoczi, 2015/02/27