[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: [Qemu-devel] [RFC PATCH RDMA support v2: 2/6] create migration-rdma.
From: |
Michael S. Tsirkin |
Subject: |
Re: [Qemu-devel] [RFC PATCH RDMA support v2: 2/6] create migration-rdma.c for core RDMA migration code |
Date: |
Thu, 21 Feb 2013 22:06:26 +0200 |
On Mon, Feb 11, 2013 at 05:49:53PM -0500, Michael R. Hines wrote:
> From: "Michael R. Hines" <address@hidden>
>
>
> Signed-off-by: Michael R. Hines <address@hidden>
> ---
> include/qemu/rdma.h | 281 ++++++++++
> migration-rdma.c | 1444
> +++++++++++++++++++++++++++++++++++++++++++++++++++
> 2 files changed, 1725 insertions(+)
> create mode 100644 include/qemu/rdma.h
> create mode 100644 migration-rdma.c
Could you document the protocol that is used, including the synchronization performed via IBV_WR_SEND, etc.?
A good place to put that documentation would be docs/.
> diff --git a/include/qemu/rdma.h b/include/qemu/rdma.h
> new file mode 100644
> index 0000000..2dc2519
> --- /dev/null
> +++ b/include/qemu/rdma.h
> @@ -0,0 +1,281 @@
> +/*
> + * Copyright (C) 2013 Michael R. Hines <address@hidden>
> + * Copyright (C) 2013 Jiuxing Liu <address@hidden>
> + *
> + * RDMA data structures and helper functions (for migration)
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; under version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef _RDMA_H
> +#define _RDMA_H
> +
> +#include "config-host.h"
> +#ifdef CONFIG_RDMA
> +#include <rdma/rdma_cma.h>
> +#endif
> +#include "monitor/monitor.h"
> +
> +extern int rdmaport;
> +extern char rdmahost[64];
> +
/*
 * All connection-manager and verbs handles for one RDMA connection.
 * rdma_init_context() NULLs every member; the teardown paths test
 * members against NULL to decide what to destroy.
 */
struct rdma_context {
    /* cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
       cm_id->verbs, cm_id->channel, and cm_id->qp. */
    struct rdma_cm_id *cm_id;      /* connected id */
    struct rdma_cm_id *listen_id;  /* server-side listening id */

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;

    struct ibv_comp_channel *comp_channel; /* only created under RDMA_BLOCKING */
    struct ibv_pd *pd;
    struct ibv_cq *cq;
};
> +
> +static inline void rdma_init_context(struct rdma_context *rdma_ctx)
> +{
> + rdma_ctx->cm_id = NULL;
> + rdma_ctx->listen_id = NULL;
> + rdma_ctx->verbs = NULL;
> + rdma_ctx->channel = NULL;
> + rdma_ctx->qp = NULL;
> + rdma_ctx->comp_channel = NULL;
> + rdma_ctx->pd = NULL;
> + rdma_ctx->cq = NULL;
> +}
> +
> +void cpu_physical_memory_reset_dirty_all(void);
> +
> +int rdma_resolve_host(struct rdma_context *rdma_ctx,
> + const char *host, int port);
> +int rdma_alloc_pd_cq(struct rdma_context *rdma_ctx);
> +int rdma_alloc_qp(struct rdma_context *rdma_ctx);
> +int rdma_migrate_connect(struct rdma_context *rdma_ctx,
> + void *in_data, int *in_len, void *out_data, int out_len);
> +int rdma_migrate_accept(struct rdma_context *rdma_ctx,
> + void *in_data, int *in_len, void *out_data, int out_len);
> +void rdma_migrate_disconnect(struct rdma_context *rdma_ctx);
> +
/* Instead of registering whole ram blocks, we can register them in smaller
 * chunks. This may be beneficial if the ram blocks have holes in them */
#define RDMA_CHUNK_REGISTRATION

/* With chunk registration, delay registering each chunk until it is first
 * written (see rdma_get_lkey() in migration-rdma.c). */
#define RDMA_LAZY_REGISTRATION

/* Chunks are 1 MiB. */
#define RDMA_REG_CHUNK_SHIFT 20
#define RDMA_REG_CHUNK_SIZE (1UL << (RDMA_REG_CHUNK_SHIFT))
/* Index (relative to start_addr's chunk) of the chunk holding host_addr.
 * NOTE: macro arguments are evaluated more than once -- no side effects. */
#define RDMA_REG_CHUNK_INDEX(start_addr, host_addr) \
    (((unsigned long)(host_addr) >> RDMA_REG_CHUNK_SHIFT) - \
    ((unsigned long)(start_addr) >> RDMA_REG_CHUNK_SHIFT))
/* Number of chunks needed to cover one whole ram block. */
#define RDMA_REG_NUM_CHUNKS(rdma_ram_block) \
    (RDMA_REG_CHUNK_INDEX((rdma_ram_block)->local_host_addr,\
    (rdma_ram_block)->local_host_addr +\
    (rdma_ram_block)->length) + 1)
/* Chunk-aligned start address of chunk i; may lie before the block start
 * for chunk 0, so callers clamp to local_host_addr. */
#define RDMA_REG_CHUNK_START(rdma_ram_block, i) ((uint8_t *)\
    ((((unsigned long)((rdma_ram_block)->local_host_addr) >> \
    RDMA_REG_CHUNK_SHIFT) + (i)) << \
    RDMA_REG_CHUNK_SHIFT))
/* One past the last address of chunk i; may lie past the block end, so
 * callers clamp to local_host_addr + length. */
#define RDMA_REG_CHUNK_END(rdma_ram_block, i) \
    (RDMA_REG_CHUNK_START(rdma_ram_block, i) + \
    RDMA_REG_CHUNK_SIZE)
> +
/* Local view of one ram block, plus what has been learned of the peer's
 * corresponding block during setup. */
struct rdma_ram_block {
    uint8_t *local_host_addr;   /* our mapping of the block */
    uint64_t remote_host_addr;  /* peer's address, from remote info exchange */
    uint64_t offset;            /* ram offset used to match blocks across hosts */
    uint64_t length;            /* block size in bytes */
    struct ibv_mr **pmr;        /* per-chunk MRs (chunk registration path) */
    struct ibv_mr *mr;          /* whole-block MR (whole-block path) */
    uint32_t remote_rkey;       /* peer's rkey used for RDMA WRITEs */
};
> +
/* Per-block description exchanged with the peer: only the fields the
 * other side needs (no local-only MR pointers). */
struct rdma_remote_ram_block {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
};

/* NOTE(review): fixed cap on ram blocks -- presumably enough for current
 * machine types; confirm before relying on it. */
#define RDMA_MAX_RAM_BLOCKS 64

/* Local table of all ram blocks being migrated. */
struct rdma_ram_blocks {
    int num_blocks;
    struct rdma_ram_block block[RDMA_MAX_RAM_BLOCKS];
};

/* Fixed-size table transferred to/from the peer in a single SEND
 * (see rdma_post_send_remote_info / rdma_post_recv_remote_info). */
struct rdma_remote_ram_blocks {
    int num_blocks;
    struct rdma_remote_ram_block block[RDMA_MAX_RAM_BLOCKS];
};
> +
> +int rdma_init_ram_blocks(struct rdma_ram_blocks *rdma_ram_blocks);
> +int rdma_reg_chunk_ram_blocks(struct rdma_context *rdma_ctx,
> + struct rdma_ram_blocks *rdma_ram_blocks);
> +int rdma_reg_whole_ram_blocks(struct rdma_context *rdma_ctx,
> + struct rdma_ram_blocks *rdma_ram_blocks);
> +int rdma_server_reg_ram_blocks(struct rdma_context *rdma_ctx,
> + struct rdma_ram_blocks *rdma_ram_blocks);
> +int rdma_client_reg_ram_blocks(struct rdma_context *rdma_ctx,
> + struct rdma_ram_blocks *rdma_ram_blocks);
> +void rdma_dereg_ram_blocks(struct rdma_ram_blocks *rdma_ram_blocks);
> +
> +void rdma_copy_to_remote_ram_blocks(struct rdma_ram_blocks *local,
> + struct rdma_remote_ram_blocks *remote);
> +int rdma_process_remote_ram_blocks(struct rdma_ram_blocks *local,
> + struct rdma_remote_ram_blocks *remote);
> +
> +int rdma_search_ram_block(uint64_t offset, uint64_t length,
> + struct rdma_ram_blocks *blocks,
> + int *block_index, int *chunk_index);
> +
/*
 * Top-level state for an RDMA migration (one global instance:
 * rdma_mdata).  Holds the connection context, the ram block tables and
 * the write-merging/completion-accounting state.
 */
struct rdma_data {
    char *host;          /* destination hostname chosen by the user */
    int port;            /* destination port; -1 means RDMA not requested */
    int enabled;         /* set once the RDMA transport is active */
    int gidx;            /* GID index -- TODO confirm how it is chosen */
    union ibv_gid gid;

    struct rdma_context rdma_ctx;
    struct rdma_ram_blocks rdma_ram_blocks;

    /* This is used for synchronization: We use
       IBV_WR_SEND to send it after all IBV_WR_RDMA_WRITEs
       are done. When the receiver gets it, it can be certain
       that all the RDMAs are completed. */
    int sync;
    struct ibv_mr *sync_mr;   /* MR covering 'sync' */

    /* This is used for the server to write the remote
       ram blocks info. */
    struct rdma_remote_ram_blocks remote_info;
    struct ibv_mr *remote_info_mr;  /* MR covering 'remote_info' */

    /* The rest is only for the initiator of the migration. */
    int client_init_done;

    /* number of outstanding unsignaled send */
    int num_unsignaled_send;

    /* number of outstanding signaled send */
    int num_signaled_send;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_offset;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    /* running total of bytes pushed via RDMA WRITE */
    uint64_t total_bytes;

};
> +
> +extern struct rdma_data rdma_mdata;
> +
> +#define rdma_update_capability(state) \
> + state->enabled_capabilities[MIGRATION_CAPABILITY_RDMA] = \
> + migration_use_rdma() ? true : false
> +
> +static inline int migration_use_rdma(void)
> +{
> + /* port will be non-zero if user wants to use RDMA. */
> + return rdma_mdata.port != -1 && rdma_mdata.host;
> +}
> +
> +static inline int migrate_rdma_enabled(void)
> +{
> + return rdma_mdata.enabled;
> +}
> +
> +void rdma_disable(void);
> +
> +#define RDMA_BLOCKING
> +#define RDMA_EXTRA_SYNC
> +
/* Work-request ids used to tell completions apart when polling the CQ. */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA,             /* an RDMA WRITE of ram block data */
    RDMA_WRID_SEND_SYNC,        /* SEND of the 'sync' flag */
    RDMA_WRID_RECV_SYNC,        /* RECV posted for the 'sync' flag */
    RDMA_WRID_SEND_REMOTE_INFO, /* SEND of the ram block table */
    RDMA_WRID_RECV_REMOTE_INFO, /* RECV posted for the ram block table */
    RDMA_WRID_SEND_EXTRA_SYNC,  /* extra round under RDMA_EXTRA_SYNC */
    RDMA_WRID_RECV_EXTRA_SYNC,
};
> +
> +int rdma_migrate_listen(struct rdma_data *mdata, char *host,
> + int port);
> +int rdma_reg_sync(struct rdma_data *mdata);
> +int rdma_dereg_sync(struct rdma_data *mdata);
> +int rdma_post_recv_sync(struct rdma_data *mdata,
> + int wr_id);
> +
> +int rdma_reg_remote_info(
> + struct rdma_data *mdata);
> +int rdma_dereg_remote_info(
> + struct rdma_data *mdata);
> +int rdma_post_send_remote_info(
> + struct rdma_data *mdata);
> +int rdma_post_recv_remote_info(
> + struct rdma_data *mdata);
> +
> +int rdma_poll_for_wrid(
> + struct rdma_data *mdata,
> + int wrid);
> +int rdma_block_for_wrid(
> + struct rdma_data *mdata,
> + int wrid);
> +
> +
> +#ifdef CONFIG_RDMA
> +int rdma_wait_for_connect(int fd, void * opaque);
> +int rdma_start_incoming_migration(int s);
> +void rdma_cleanup(struct rdma_data *mdata);
> +int rdma_client_init(struct rdma_data *mdata);
> +int rdma_client_connect(struct rdma_data *mdata);
> +void rdma_data_init(struct rdma_data *mdata);
> +int rdma_server_init(struct rdma_data *mdata);
> +int rdma_server_prepare(struct rdma_data *mdata);
> +int rdma_accept_incoming_migration(
> + struct rdma_data *mdata);
> +int rdma_write(struct rdma_data *mdata,
> + uint64_t addr, uint64_t len);
> +int rdma_write_flush(struct rdma_data *mdata);
> +int rdma_poll(struct rdma_data *mdata);
> +int rdma_post_send_sync(struct rdma_data *mdata,
> + int wr_id);
> +int rdma_wait_for_wrid(
> + struct rdma_data *mdata,
> + int wrid);
> +#else
> +#define rdma_cleanup(...) do { printf("WARN: rdma not enabled\n"); } while(0)
> +#define rdma_data_init(...) do { printf("WARN: rdma not enabled\n"); }
> while(0)
> +#define rdma_wait_for_connect(...) 0
> +#define rdma_start_incoming_migration(...) 0
> +#define rdma_client_init(...) 0
> +#define rdma_client_connect(...) 0
> +#define rdma_server_init(...) 0
> +#define rdma_server_prepare(...) 0
> +#define rdma_accept_incoming_migration(...) 0
> +#define rdma_write(...) 0
> +#define rdma_write_flush(...) 0
> +#define rdma_poll(...) 0
> +#define rdma_post_send_sync(...) 0
> +#define rdma_wait_for_wrid(...) 0
> +#endif
> +
> +#endif
> diff --git a/migration-rdma.c b/migration-rdma.c
> new file mode 100644
> index 0000000..a64f350
> --- /dev/null
> +++ b/migration-rdma.c
> @@ -0,0 +1,1444 @@
> +/*
> + * Copyright (C) 2013 Michael R. Hines <address@hidden>
> + * Copyright (C) 2013 Jiuxing Liu <address@hidden>
> + *
> + * RDMA data structures and helper functions (for migration)
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; under version 2 of the License.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, see <http://www.gnu.org/licenses/>.
> + */
> +#include "qemu/rdma.h"
> +#include "qemu-common.h"
> +#include "migration/migration.h"
> +#include <stdio.h>
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <netdb.h>
> +#include <arpa/inet.h>
> +#include <string.h>
> +
> +#define RDMA_RESOLVE_TIMEOUT_MS 10000
> +#define RDMA_CQ_SIZE 2000
> +#define RDMA_QP_SIZE 1000
> +
> +//#define DEBUG_MIGRATION_RDMA
> +
> +#ifdef DEBUG_MIGRATION_RDMA
> +#define DPRINTF(fmt, ...) \
> + do { printf("migration-rdma: " fmt, ## __VA_ARGS__); } while (0)
> +#else
> +#define DPRINTF(fmt, ...) \
> + do { } while (0)
> +#endif
> +
/*
 * Allocate a zero-initialized buffer of 'size' bytes.
 *
 * Uses calloc() so zeroing is done by the allocator; the previous
 * malloc()+memset() pair dereferenced a NULL pointer on out-of-memory.
 * Returns NULL on allocation failure -- callers already check the
 * result (see rdma_reg_chunk_ram_blocks / rdma_get_lkey).
 */
static void *rdma_mallocz(size_t size)
{
    return calloc(1, size);
}
> +
/*
 * Debug aid: print identifying information about the verbs device behind
 * an opened ibv_context, so the operator can see which RDMA device was
 * selected.
 */
static void rdma_dump_id(const char * who, struct ibv_context * verbs)
{
    printf("%s RDMA verbs Device opened: kernel name %s uverbs device name %s, infiniband_verbs class device path %s, infiniband class device path %s\n", who, verbs->device->name, verbs->device->dev_name, verbs->device->dev_path, verbs->device->ibdev_path);
}
> +
> +static void rdma_dump_gid(const char * who, struct rdma_cm_id * id)
> +{
> + char sgid[33];
> + char dgid[33];
> + inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
> + inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
> + printf("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid);
> +}
> +
/*
 * Resolve the migration destination 'host':'port' and bind/resolve the
 * local RDMA CM id for the client (migration source) side.
 *
 * Fixes over the previous version:
 *  - getaddrinfo() reports failure with a non-zero EAI_* code that is
 *    not required to be negative, so test != 0 rather than < 0;
 *  - the addrinfo results were never freed (memory leak);
 *  - cm_id is NULLed after destruction on the error path.
 *
 * Returns 0 on success with cm_id/channel/verbs initialized, -1 on
 * failure with everything torn down again.
 */
int rdma_resolve_host(struct rdma_context *rdma_ctx,
        const char *host, int port)
{
    int ret;
    struct addrinfo *res = NULL, *src = NULL;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    /* FIXME: hard-coded source hostname left over from development.
     * The bind source should come from the user (or be omitted so the
     * CM picks a local device), not be baked in here. */
    const char * srchost = "r9";

    if (host == NULL || !strcmp(host, "")) {
        printf("RDMA hostname has not been set\n");
        return -1;
    }

    /* create CM channel */
    rdma_ctx->channel = rdma_create_event_channel();
    if (!rdma_ctx->channel) {
        printf("could not create CM channel\n");
        return -1;
    }

    /* create CM id */
    ret = rdma_create_id(rdma_ctx->channel, &rdma_ctx->cm_id, NULL,
            RDMA_PS_TCP);
    if (ret) {
        printf("could not create channel id\n");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", port);
    port_str[15] = '\0';

    ret = getaddrinfo(host, port_str, NULL, &res);
    if (ret != 0) {
        printf("could not getaddrinfo destination address %s\n", host);
        res = NULL;
        goto err_resolve_get_addr;
    }

    ret = getaddrinfo(srchost, port_str, NULL, &src);
    if (ret != 0) {
        printf("could not getaddrinfo source address %s\n", srchost);
        src = NULL;
        goto err_resolve_get_addr;
    }

    /* There may be multiple RDMA devices. Choose the right one.
     * (We may need to select specific ports in the future, too.)
     */
    ret = rdma_bind_addr(rdma_ctx->cm_id, src->ai_addr);
    if (ret < 0) {
        printf("could not bind source client to local rdma device with src %s\n", srchost);
        goto err_resolve_get_addr;
    }

    rdma_dump_id("client_bind", rdma_ctx->cm_id->verbs);
    rdma_dump_gid("client_bind", rdma_ctx->cm_id);

    /* resolve the first address */
    ret = rdma_resolve_addr(rdma_ctx->cm_id, NULL, res->ai_addr,
            RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        printf("could not resolve address %s\n", host);
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
    if (ret) {
        printf("could not perform event_addr_resolved\n");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        printf("result not equal to event_addr_resolved %s\n",
                rdma_event_str(cm_event->event));
        perror("rdma_resolve_addr");
        rdma_ack_cm_event(cm_event);
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma_dump_gid("client_resolved", rdma_ctx->cm_id);

    /* resolve route */
    ret = rdma_resolve_route(rdma_ctx->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        printf("could not resolve rdma route\n");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
    if (ret) {
        printf("could not perform event_route_resolved\n");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        printf("result not equal to event_route_resolved: %s\n",
                rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* the addrinfo results are only needed during setup */
    freeaddrinfo(res);
    freeaddrinfo(src);

    rdma_ctx->verbs = rdma_ctx->cm_id->verbs;
    return 0;

err_resolve_get_addr:
    if (res) {
        freeaddrinfo(res);
    }
    if (src) {
        freeaddrinfo(src);
    }
    rdma_destroy_id(rdma_ctx->cm_id);
    rdma_ctx->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma_ctx->channel);
    rdma_ctx->channel = NULL;

    return -1;
}
> +
> +int rdma_alloc_pd_cq(struct rdma_context *rdma_ctx)
> +{
> +
> + /* allocate pd */
> + rdma_ctx->pd = ibv_alloc_pd(rdma_ctx->verbs);
> + if (!rdma_ctx->pd) {
> + return -1;
> + }
> +
> +#ifdef RDMA_BLOCKING
> + /* create completion channel */
> + rdma_ctx->comp_channel = ibv_create_comp_channel(rdma_ctx->verbs);
> + if (!rdma_ctx->comp_channel) {
> + goto err_alloc_pd_cq;
> + }
> +#endif
> +
> + /* create cq */
> + rdma_ctx->cq = ibv_create_cq(rdma_ctx->verbs, RDMA_CQ_SIZE,
> + NULL, rdma_ctx->comp_channel, 0);
> + if (!rdma_ctx->cq) {
> + goto err_alloc_pd_cq;
> + }
> +
> + return 0;
> +
> +err_alloc_pd_cq:
> + if (rdma_ctx->pd) {
> + ibv_dealloc_pd(rdma_ctx->pd);
> + }
> + if (rdma_ctx->comp_channel) {
> + ibv_destroy_comp_channel(rdma_ctx->comp_channel);
> + }
> + rdma_ctx->pd = NULL;
> + rdma_ctx->comp_channel = NULL;
> + return -1;
> +
> +}
> +
> +int rdma_alloc_qp(struct rdma_context *rdma_ctx)
> +{
> + struct ibv_qp_init_attr attr = { 0 };
> + int ret;
> +
> + attr.cap.max_send_wr = RDMA_QP_SIZE;
> + attr.cap.max_recv_wr = 2;
> + attr.cap.max_send_sge = 1;
> + attr.cap.max_recv_sge = 1;
> + attr.send_cq = rdma_ctx->cq;
> + attr.recv_cq = rdma_ctx->cq;
> + attr.qp_type = IBV_QPT_RC;
> +
> + ret = rdma_create_qp(rdma_ctx->cm_id, rdma_ctx->pd, &attr);
> + if (ret) {
> + return -1;
> + }
> +
> + rdma_ctx->qp = rdma_ctx->cm_id->qp;
> + return 0;
> +}
> +
> +int rdma_wait_for_connect(int fd, void * opaque)
> +{
> + if (rdma_client_init(&rdma_mdata)) {
> + return -1;
> + }
> +
> + if (rdma_client_connect(&rdma_mdata)) {
> + return -1;
> + }
> +
> + return 0;
> +}
> +int rdma_migrate_connect(struct rdma_context *rdma_ctx,
> + void *in_data, int *in_len, void *out_data, int out_len)
> +{
> + int ret;
> + struct rdma_conn_param conn_param = { 0 };
> + struct rdma_cm_event *cm_event;
> +
> + conn_param.initiator_depth = 2;
> + conn_param.retry_count = 5;
> + conn_param.private_data = out_data;
> + conn_param.private_data_len = out_len;
> +
> + ret = rdma_connect(rdma_ctx->cm_id, &conn_param);
> + if (ret) {
> + return -1;
> + }
> +
> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> + if (ret) {
> + return -1;
> + }
> + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
> + return -1;
> + }
> +
> + if (in_len) {
> + if (*in_len > cm_event->param.conn.private_data_len) {
> + *in_len = cm_event->param.conn.private_data_len;
> + }
> + if (*in_len) {
> + memcpy(in_data, cm_event->param.conn.private_data, *in_len);
> + }
> + }
> +
> + rdma_ack_cm_event(cm_event);
> +
> + return 0;
> +}
> +
> +int rdma_migrate_listen(struct rdma_data *mdata, char *host,
> + int port)
> +{
> + int ret;
> + struct rdma_cm_event *cm_event;
> + struct rdma_context *rdma_ctx = &mdata->rdma_ctx;
> + struct ibv_context *verbs;
> +
> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> + if (ret) {
> + goto err_listen;
> + }
> +
> + if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
> + rdma_ack_cm_event(cm_event);
> + goto err_listen;
> + }
> +
> + rdma_ctx->cm_id = cm_event->id;
> + verbs = cm_event->id->verbs;
> + printf("verbs context after listen: %p\n", verbs);
> + rdma_ack_cm_event(cm_event);
> +
> + if (!rdma_ctx->verbs) {
> + rdma_ctx->verbs = verbs;
> + ret = rdma_server_prepare(mdata);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error preparing server!\n");
> + goto err_listen;
> + }
> + } else if (rdma_ctx->verbs != verbs) {
> + fprintf(stderr, "ibv context not matching %p, %p!\n",
> + rdma_ctx->verbs, verbs);
> + goto err_listen;
> + }
> + /* xxx destroy listen_id ??? */
> +
> + return 0;
> +
> +err_listen:
> +
> + return -1;
> +
> +}
> +
> +int rdma_migrate_accept(struct rdma_context *rdma_ctx,
> + void *in_data, int *in_len, void *out_data, int out_len)
> +{
> + int ret;
> + struct rdma_conn_param conn_param = { 0 };
> + struct rdma_cm_event *cm_event;
> +
> + conn_param.responder_resources = 2;
> + conn_param.private_data = out_data;
> + conn_param.private_data_len = out_len;
> +
> + ret = rdma_accept(rdma_ctx->cm_id, &conn_param);
> + if (ret) {
> + fprintf(stderr, "rdma_accept returns %d!\n", ret);
> + return -1;
> + }
> +
> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> + if (ret) {
> + return -1;
> + }
> +
> + if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
> + rdma_ack_cm_event(cm_event);
> + return -1;
> + }
> +
> + if (in_len) {
> + if (*in_len > cm_event->param.conn.private_data_len) {
> + *in_len = cm_event->param.conn.private_data_len;
> + }
> + if (*in_len) {
> + memcpy(in_data, cm_event->param.conn.private_data, *in_len);
> + }
> + }
> +
> + rdma_ack_cm_event(cm_event);
> +
> + return 0;
> +}
> +
> +void rdma_migrate_disconnect(struct rdma_context *rdma_ctx)
> +{
> + int ret;
> + struct rdma_cm_event *cm_event;
> +
> + ret = rdma_disconnect(rdma_ctx->cm_id);
> + if (ret) {
> + return;
> + }
> + ret = rdma_get_cm_event(rdma_ctx->channel, &cm_event);
> + if (ret) {
> + return;
> + }
> + rdma_ack_cm_event(cm_event);
> +}
> +
> +int rdma_reg_chunk_ram_blocks(struct rdma_context *rdma_ctx,
> + struct rdma_ram_blocks *rdma_ram_blocks)
> +{
> + int i, j;
> + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
> + struct rdma_ram_block *block = &(rdma_ram_blocks->block[i]);
> + int num_chunks = RDMA_REG_NUM_CHUNKS(block);
> + /* allocate memory to store chunk MRs */
> + rdma_ram_blocks->block[i].pmr = rdma_mallocz(
> + num_chunks * sizeof(struct ibv_mr *));
> +
> + if (!block->pmr) {
> + goto err_reg_chunk_ram_blocks;
> + }
> +
> + for (j = 0; j < num_chunks; j++) {
> + uint8_t *start_addr = RDMA_REG_CHUNK_START(block, j);
> + uint8_t *end_addr = RDMA_REG_CHUNK_END(block, j);
> + if (start_addr < block->local_host_addr) {
> + start_addr = block->local_host_addr;
> + }
> + if (end_addr > block->local_host_addr + block->length) {
> + end_addr = block->local_host_addr + block->length;
> + }
> + block->pmr[j] = ibv_reg_mr(rdma_ctx->pd,
> + start_addr,
> + end_addr - start_addr,
> + IBV_ACCESS_LOCAL_WRITE |
> + IBV_ACCESS_REMOTE_WRITE |
> + IBV_ACCESS_REMOTE_READ);
> + if (!block->pmr[j]) {
> + break;
> + }
> + }
> + if (j < num_chunks) {
> + for (j--; j >= 0; j--) {
> + ibv_dereg_mr(block->pmr[j]);
> + }
> + block->pmr[i] = NULL;
> + goto err_reg_chunk_ram_blocks;
> + }
> + }
> +
> + return 0;
> +
> +err_reg_chunk_ram_blocks:
> + for (i--; i >= 0; i--) {
> + int num_chunks =
> + RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
> + for (j = 0; j < num_chunks; j++) {
> + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]);
> + }
> + free(rdma_ram_blocks->block[i].pmr);
> + rdma_ram_blocks->block[i].pmr = NULL;
> + }
> +
> + return -1;
> +
> +}
> +
> +int rdma_reg_whole_ram_blocks(struct rdma_context *rdma_ctx,
> + struct rdma_ram_blocks *rdma_ram_blocks)
> +{
> + int i;
> + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
> + rdma_ram_blocks->block[i].mr =
> + ibv_reg_mr(rdma_ctx->pd,
> + rdma_ram_blocks->block[i].local_host_addr,
> + rdma_ram_blocks->block[i].length,
> + IBV_ACCESS_LOCAL_WRITE |
> + IBV_ACCESS_REMOTE_WRITE |
> + IBV_ACCESS_REMOTE_READ);
> + if (!rdma_ram_blocks->block[i].mr) {
> + break;
> + }
> + }
> +
> + if (i >= rdma_ram_blocks->num_blocks) {
> + return 0;
> + }
> +
> + for (i--; i >= 0; i--) {
> + ibv_dereg_mr(rdma_ram_blocks->block[i].mr);
> + }
> +
> + return -1;
> +
> +}
> +
/*
 * Client-side ram block registration policy, chosen at compile time:
 *  - chunked + lazy: register nothing up front (chunks are registered
 *    on demand by rdma_get_lkey());
 *  - chunked, not lazy: register every chunk now;
 *  - otherwise: register each block as one whole MR.
 */
int rdma_client_reg_ram_blocks(struct rdma_context *rdma_ctx,
        struct rdma_ram_blocks *rdma_ram_blocks)
{
#ifdef RDMA_CHUNK_REGISTRATION
#ifdef RDMA_LAZY_REGISTRATION
    return 0;
#else
    return rdma_reg_chunk_ram_blocks(rdma_ctx, rdma_ram_blocks);
#endif
#else
    return rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks);
#endif
}
> +
/* The server side always registers whole ram blocks up front (it must
 * be able to receive RDMA WRITEs anywhere immediately). */
int rdma_server_reg_ram_blocks(struct rdma_context *rdma_ctx,
        struct rdma_ram_blocks *rdma_ram_blocks)
{
    return rdma_reg_whole_ram_blocks(rdma_ctx, rdma_ram_blocks);
}
> +
> +void rdma_dereg_ram_blocks(struct rdma_ram_blocks *rdma_ram_blocks)
> +{
> + int i, j;
> + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
> + int num_chunks;
> + if (!rdma_ram_blocks->block[i].pmr) {
> + continue;
> + }
> + num_chunks = RDMA_REG_NUM_CHUNKS(&(rdma_ram_blocks->block[i]));
> + for (j = 0; j < num_chunks; j++) {
> + if (!rdma_ram_blocks->block[i].pmr[j]) {
> + continue;
> + }
> + ibv_dereg_mr(rdma_ram_blocks->block[i].pmr[j]);
> + }
> + free(rdma_ram_blocks->block[i].pmr);
> + rdma_ram_blocks->block[i].pmr = NULL;
> + }
> + for (i = 0; i < rdma_ram_blocks->num_blocks; i++) {
> + if (!rdma_ram_blocks->block[i].mr) {
> + continue;
> + }
> + ibv_dereg_mr(rdma_ram_blocks->block[i].mr);
> + rdma_ram_blocks->block[i].mr = NULL;
> + }
> +}
> +
> +void rdma_copy_to_remote_ram_blocks(struct rdma_ram_blocks *local,
> + struct rdma_remote_ram_blocks *remote)
> +{
> + int i;
> + remote->num_blocks = local->num_blocks;
> + for (i = 0; i < local->num_blocks; i++) {
> + remote->block[i].remote_host_addr =
> + (uint64_t)(local->block[i].local_host_addr);
> + remote->block[i].remote_rkey = local->block[i].mr->rkey;
> + remote->block[i].offset = local->block[i].offset;
> + remote->block[i].length = local->block[i].length;
> + }
> +}
> +
> +int rdma_process_remote_ram_blocks(struct rdma_ram_blocks *local,
> + struct rdma_remote_ram_blocks *remote)
> +{
> + int i, j;
> +
> + if (local->num_blocks != remote->num_blocks) {
> + return -1;
> + }
> +
> + for (i = 0; i < remote->num_blocks; i++) {
> + /* search local ram blocks */
> + for (j = 0; j < local->num_blocks; j++) {
> + if (remote->block[i].offset != local->block[j].offset) {
> + continue;
> + }
> + if (remote->block[i].length != local->block[j].length) {
> + return -1;
> + }
> + local->block[j].remote_host_addr =
> + remote->block[i].remote_host_addr;
> + local->block[j].remote_rkey = remote->block[i].remote_rkey;
> + break;
> + }
> + if (j >= local->num_blocks) {
> + return -1;
> + }
> + }
> +
> + return 0;
> +}
> +
> +int rdma_search_ram_block(uint64_t offset, uint64_t length,
> + struct rdma_ram_blocks *blocks,
> + int *block_index, int *chunk_index)
> +{
> + int i;
> + for (i = 0; i < blocks->num_blocks; i++) {
> + if (offset < blocks->block[i].offset) {
> + continue;
> + }
> + if (offset + length >
> + blocks->block[i].offset + blocks->block[i].length) {
> + continue;
> + }
> + *block_index = i;
> + if (chunk_index) {
> + uint8_t *host_addr = blocks->block[i].local_host_addr +
> + (offset - blocks->block[i].offset);
> + *chunk_index = RDMA_REG_CHUNK_INDEX(
> + blocks->block[i].local_host_addr, host_addr);
> + }
> + return 0;
> + }
> + return -1;
> +}
> +
/*
 * Return (via *lkey) a local key covering host_addr inside 'block'.
 *
 * If the whole block is registered (block->mr set), use that key.
 * Otherwise lazily register just the chunk containing host_addr,
 * allocating the per-chunk MR table on first use.  Returns 0 on
 * success, -1 if allocation or registration fails.
 */
static int rdma_get_lkey(struct rdma_context *rdma_ctx,
        struct rdma_ram_block *block, uint64_t host_addr,
        uint32_t *lkey)
{
    int chunk;
    /* whole-block registration wins if present */
    if (block->mr) {
        *lkey = block->mr->lkey;
        return 0;
    }
    if (!block->pmr) {
        int num_chunks = RDMA_REG_NUM_CHUNKS(block);
        /* allocate memory to store chunk MRs */
        block->pmr = rdma_mallocz(num_chunks *
                sizeof(struct ibv_mr *));
        if (!block->pmr) {
            return -1;
        }
    }
    chunk = RDMA_REG_CHUNK_INDEX(block->local_host_addr, host_addr);
    if (!block->pmr[chunk]) {
        /* first touch of this chunk: register it now, clamped to the
         * bounds of the block */
        uint8_t *start_addr = RDMA_REG_CHUNK_START(block, chunk);
        uint8_t *end_addr = RDMA_REG_CHUNK_END(block, chunk);
        if (start_addr < block->local_host_addr) {
            start_addr = block->local_host_addr;
        }
        if (end_addr > block->local_host_addr + block->length) {
            end_addr = block->local_host_addr + block->length;
        }
        block->pmr[chunk] = ibv_reg_mr(rdma_ctx->pd,
                start_addr,
                end_addr - start_addr,
                IBV_ACCESS_LOCAL_WRITE |
                IBV_ACCESS_REMOTE_WRITE |
                IBV_ACCESS_REMOTE_READ);
        if (!block->pmr[chunk]) {
            return -1;
        }
    }
    *lkey = block->pmr[chunk]->lkey;
    return 0;
}
> +
> +/* Do not merge data if larger than this. */
> +#define RDMA_MERGE_MAX (4 * 1024 * 1024)
> +
> +#define RDMA_UNSIGNALED_SEND_MAX 64
> +
> +int rdma_reg_sync(struct rdma_data *mdata)
> +{
> + mdata->sync_mr = ibv_reg_mr(mdata->rdma_ctx.pd,
> + &mdata->sync,
> + sizeof mdata->sync,
> + IBV_ACCESS_LOCAL_WRITE |
> + IBV_ACCESS_REMOTE_WRITE |
> + IBV_ACCESS_REMOTE_READ);
> + if (mdata->sync_mr) {
> + return 0;
> + }
> + return -1;
> +}
> +
/* Drop the MR covering mdata->sync; returns ibv_dereg_mr()'s result
 * (0 on success, errno value on failure). */
int rdma_dereg_sync(struct rdma_data *mdata)
{
    return ibv_dereg_mr(mdata->sync_mr);
}
> +
> +int rdma_reg_remote_info(
> + struct rdma_data *mdata)
> +{
> + mdata->remote_info_mr = ibv_reg_mr(mdata->rdma_ctx.pd,
> + &mdata->remote_info,
> + sizeof mdata->remote_info,
> + IBV_ACCESS_LOCAL_WRITE |
> + IBV_ACCESS_REMOTE_WRITE |
> + IBV_ACCESS_REMOTE_READ);
> + if (mdata->remote_info_mr) {
> + return 0;
> + }
> + return -1;
> +}
> +
/* Drop the MR covering mdata->remote_info; returns ibv_dereg_mr()'s
 * result (0 on success, errno value on failure). */
int rdma_dereg_remote_info(
        struct rdma_data *mdata)
{
    return ibv_dereg_mr(mdata->remote_info_mr);
}
> +
> +
> +int rdma_post_send_sync(struct rdma_data *mdata,
> + int wr_id)
> +{
> + struct ibv_sge sge;
> + struct ibv_send_wr send_wr = { 0 };
> + struct ibv_send_wr *bad_wr;
> +
> + mdata->sync = 1;
> +
> + sge.addr = (uint64_t)(&mdata->sync);
> + sge.length = sizeof mdata->sync;
> + sge.lkey = mdata->sync_mr->lkey;
> +
> + send_wr.wr_id = wr_id;
> + send_wr.opcode = IBV_WR_SEND;
> + send_wr.send_flags = IBV_SEND_SIGNALED;
> + send_wr.sg_list = &sge;
> + send_wr.num_sge = 1;
> +
> + if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) {
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +int rdma_post_recv_sync(struct rdma_data *mdata,
> + int wr_id)
> +{
> + struct ibv_sge sge;
> + struct ibv_recv_wr recv_wr = { 0 };
> + struct ibv_recv_wr *bad_wr;
> +
> + mdata->sync = 1;
> +
> + sge.addr = (uint64_t)(&mdata->sync);
> + sge.length = sizeof mdata->sync;
> + sge.lkey = mdata->sync_mr->lkey;
> +
> + recv_wr.wr_id = wr_id;
> + recv_wr.sg_list = &sge;
> + recv_wr.num_sge = 1;
> +
> + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) {
> + return -1;
> + }
> +
> + return 0;
> +
> +}
> +
/*
 * SEND the local ram block table (mdata->remote_info) to the peer as a
 * signaled work request.  Returns 0 if the WR was posted, -1 otherwise.
 */
int rdma_post_send_remote_info(
        struct rdma_data *mdata)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;

    sge.addr = (uint64_t)(&mdata->remote_info);
    sge.length = sizeof mdata->remote_info;
    sge.lkey = mdata->remote_info_mr->lkey;

    send_wr.wr_id = RDMA_WRID_SEND_REMOTE_INFO;
    send_wr.opcode = IBV_WR_SEND;
    send_wr.send_flags = IBV_SEND_SIGNALED;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;

    if (ibv_post_send(mdata->rdma_ctx.qp, &send_wr, &bad_wr)) {
        return -1;
    }

    /* NOTE(review): this *decrements* num_signaled_send right after
     * posting a signaled send, while rdma_write_flush() *increments*
     * the counter for signaled writes -- looks inverted; confirm the
     * intended accounting before relying on this counter. */
    mdata->num_signaled_send--;
    return 0;
}
> +
> +int rdma_post_recv_remote_info(
> + struct rdma_data *mdata)
> +{
> + struct ibv_sge sge;
> + struct ibv_recv_wr recv_wr = { 0 };
> + struct ibv_recv_wr *bad_wr;
> +
> + sge.addr = (uint64_t)(&mdata->remote_info);
> + sge.length = sizeof mdata->remote_info;
> + sge.lkey = mdata->remote_info_mr->lkey;
> +
> + recv_wr.wr_id = RDMA_WRID_RECV_REMOTE_INFO;
> + recv_wr.sg_list = &sge;
> + recv_wr.num_sge = 1;
> +
> + if (ibv_post_recv(mdata->rdma_ctx.qp, &recv_wr, &bad_wr)) {
> + return -1;
> + }
> +
> + return 0;
> +}
> +
/*
 * Post one RDMA WRITE of [offset, offset+length) within 'block' to the
 * corresponding location on the remote side (remote_host_addr/rkey
 * learned during setup).  'flag' selects signaled vs unsignaled
 * completion; 'wr_id' tags the completion.  Returns 0 if the WR was
 * posted, -1 on error.
 */
static int rdma_write_actual(struct rdma_context *rdma_ctx,
        struct rdma_ram_block *block,
        uint64_t offset, uint64_t length,
        uint64_t wr_id, enum ibv_send_flags flag)
{
    struct ibv_sge sge;
    struct ibv_send_wr send_wr = { 0 };
    struct ibv_send_wr *bad_wr;

    /* local source: same displacement into the block as on the remote */
    sge.addr = (uint64_t)(block->local_host_addr + (offset - block->offset));
    sge.length = length;
    /* may lazily register the chunk under RDMA_LAZY_REGISTRATION */
    if (rdma_get_lkey(rdma_ctx, block, sge.addr, &sge.lkey)) {
        fprintf(stderr, "cannot get lkey!\n");
        return -1;
    }
    send_wr.wr_id = wr_id;
    send_wr.opcode = IBV_WR_RDMA_WRITE;
    send_wr.send_flags = flag;
    send_wr.sg_list = &sge;
    send_wr.num_sge = 1;
    send_wr.wr.rdma.rkey = block->remote_rkey;
    send_wr.wr.rdma.remote_addr = block->remote_host_addr +
        (offset - block->offset);

    if (ibv_post_send(rdma_ctx->qp, &send_wr, &bad_wr)) {
        return -1;
    }

    return 0;
}
> +
/*
 * Flush the currently merged write buffer (current_offset/length within
 * block current_index) as a single RDMA WRITE.  Every
 * RDMA_UNSIGNALED_SEND_MAX writes one is posted signaled, so that
 * completions can be reaped and the send queue does not fill up.
 * Returns 0 on success (including nothing to flush), the error value
 * from rdma_write_actual() otherwise.
 */
int rdma_write_flush(struct rdma_data *mdata)
{
    int ret;
    enum ibv_send_flags flags = 0;

    /* nothing buffered */
    if (!mdata->current_length) {
        return 0;
    }
    /* time to request a completion? */
    if (mdata->num_unsignaled_send >=
            RDMA_UNSIGNALED_SEND_MAX) {
        flags = IBV_SEND_SIGNALED;
    }
    ret = rdma_write_actual(&mdata->rdma_ctx,
            &(mdata->rdma_ram_blocks.block[mdata->current_index]),
            mdata->current_offset,
            mdata->current_length,
            RDMA_WRID_RDMA, flags);

    if (ret) {
        if (ret < 0) {
            fprintf(stderr, "rdma migration: write flush error!\n");
        }
        return ret;
    }

    /* update the signaled/unsignaled accounting */
    if (mdata->num_unsignaled_send >=
            RDMA_UNSIGNALED_SEND_MAX) {
        mdata->num_unsignaled_send = 0;
        mdata->num_signaled_send++;
    } else {
        mdata->num_unsignaled_send++;
    }

    /* buffer is now empty */
    mdata->total_bytes += mdata->current_length;
    mdata->current_length = 0;
    mdata->current_offset = 0;

    return 0;
}
> +
> +static inline int rdma_in_current_block(
> + struct rdma_data *mdata,
> + uint64_t offset, uint64_t len)
> +{
> + struct rdma_ram_block *block =
> + &(mdata->rdma_ram_blocks.block[mdata->current_index]);
> + if (mdata->current_index < 0) {
> + return 0;
> + }
> + if (offset < block->offset) {
> + return 0;
> + }
> + if (offset + len > block->offset + block->length) {
> + return 0;
> + }
> + return 1;
> +}
> +
> +static inline int rdma_in_current_chunk(
> + struct rdma_data *mdata,
> + uint64_t offset, uint64_t len)
> +{
> + struct rdma_ram_block *block =
> + &(mdata->rdma_ram_blocks.block[mdata->current_index]);
> + uint8_t *chunk_start, *chunk_end, *host_addr;
> + if (mdata->current_chunk < 0) {
> + return 0;
> + }
> + host_addr = block->local_host_addr + (offset - block->offset);
> + chunk_start = RDMA_REG_CHUNK_START(block, mdata->current_chunk);
> + if (chunk_start < block->local_host_addr) {
> + chunk_start = block->local_host_addr;
> + }
> + if (host_addr < chunk_start) {
> + return 0;
> + }
> + chunk_end = RDMA_REG_CHUNK_END(block, mdata->current_chunk);
> + if (chunk_end > chunk_start + block->length) {
> + chunk_end = chunk_start + block->length;
> + }
> + if (host_addr + len > chunk_end) {
> + return 0;
> + }
> + return 1;
> +}
> +
> +static inline int rdma_buffer_mergable(
> + struct rdma_data *mdata,
> + uint64_t offset, uint64_t len)
> +{
> + if (mdata->current_length == 0) {
> + return 0;
> + }
> + if (offset != mdata->current_offset + mdata->current_length) {
> + return 0;
> + }
> + if (!rdma_in_current_block(mdata, offset, len)) {
> + return 0;
> + }
> +#ifdef RDMA_CHUNK_REGISTRATION
> + if (!rdma_in_current_chunk(mdata, offset, len)) {
> + return 0;
> + }
> +#endif
> + return 1;
> +}
> +
> +/* Note that buffer must be within a single block/chunk. */
> +int rdma_write(struct rdma_data *mdata,
> + uint64_t offset, uint64_t len)
> +{
> + int index = mdata->current_index;
> + int chunk_index = mdata->current_chunk;
> + int ret;
> +
> + /* If we cannot merge it, we flush the current buffer first. */
> + if (!rdma_buffer_mergable(mdata, offset, len)) {
> + ret = rdma_write_flush(mdata);
> + if (ret) {
> + return ret;
> + }
> + mdata->current_length = 0;
> + mdata->current_offset = offset;
> +
> + if (rdma_search_ram_block(offset, len,
> + &mdata->rdma_ram_blocks, &index, &chunk_index)) {
> + return -1;
> + }
> + mdata->current_index = index;
> + mdata->current_chunk = chunk_index;
> + }
> +
> + /* merge it */
> + mdata->current_length += len;
> +
> + /* flush it if buffer is too large */
> + if (mdata->current_length >= RDMA_MERGE_MAX) {
> + return rdma_write_flush(mdata);
> + }
> +
> + return 0;
> +}
> +
> +int rdma_poll(struct rdma_data *mdata)
> +{
> + int ret;
> + struct ibv_wc wc;
> +
> + ret = ibv_poll_cq(mdata->rdma_ctx.cq, 1, &wc);
> + if (!ret) {
> + return RDMA_WRID_NONE;
> + }
> + if (ret < 0) {
> + fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
> + return ret;
> + }
> + if (wc.status != IBV_WC_SUCCESS) {
> + fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
> + wc.status, ibv_wc_status_str(wc.status));
> + fprintf(stderr, "ibv_poll_cq wrid=%"PRIu64"!\n", wc.wr_id);
> +
> + return -1;
> + }
> +
> + if (!(wc.opcode & IBV_WC_RECV)) {
> + mdata->num_signaled_send--;
> + }
> +
> + return (int)wc.wr_id;
> +}
> +
> +int rdma_wait_for_wrid(
> + struct rdma_data *mdata,
> + int wrid)
> +{
> +#ifdef RDMA_BLOCKING
> + return rdma_block_for_wrid(mdata, wrid);
> +#else
> + return rdma_poll_for_wrid(mdata, wrid);
> +#endif
> +}
> +
> +int rdma_poll_for_wrid(
> + struct rdma_data *mdata,
> + int wrid)
> +{
> + int r = RDMA_WRID_NONE;
> + while (r != wrid) {
> + r = rdma_poll(mdata);
> + if (r < 0) {
> + return r;
> + }
> + }
> + return 0;
> +}
> +
> +int rdma_block_for_wrid(
> + struct rdma_data *mdata,
> + int wrid)
> +{
> + int num_cq_events = 0;
> + int r = RDMA_WRID_NONE;
> + struct ibv_cq *cq;
> + void *cq_ctx;
> +
> + if (ibv_req_notify_cq(mdata->rdma_ctx.cq, 0)) {
> + return -1;
> + }
> + /* poll cq first */
> + while (r != wrid) {
> + r = rdma_poll(mdata);
> + if (r < 0) {
> + return r;
> + }
> + if (r == RDMA_WRID_NONE) {
> + break;
> + }
> + }
> + if (r == wrid) {
> + return 0;
> + }
> +
> + while (1) {
> + if (ibv_get_cq_event(mdata->rdma_ctx.comp_channel,
> + &cq, &cq_ctx)) {
> + goto err_block_for_wrid;
> + }
> + num_cq_events++;
> + if (ibv_req_notify_cq(cq, 0)) {
> + goto err_block_for_wrid;
> + }
> + /* poll cq */
> + while (r != wrid) {
> + r = rdma_poll(mdata);
> + if (r < 0) {
> + goto err_block_for_wrid;
> + }
> + if (r == RDMA_WRID_NONE) {
> + break;
> + }
> + }
> + if (r == wrid) {
> + goto success_block_for_wrid;
> + }
> + }
> +
> +success_block_for_wrid:
> + if (num_cq_events) {
> + ibv_ack_cq_events(cq, num_cq_events);
> + }
> + return 0;
> +
> +err_block_for_wrid:
> + if (num_cq_events) {
> + ibv_ack_cq_events(cq, num_cq_events);
> + }
> + return -1;
> +}
> +
> +void rdma_cleanup(struct rdma_data *mdata)
> +{
> + struct rdma_context *rdma_ctx = &mdata->rdma_ctx;
> +
> + mdata->enabled = 0;
> + if (mdata->sync_mr) {
> + rdma_dereg_sync(mdata);
> + }
> + if (mdata->remote_info_mr) {
> + rdma_dereg_remote_info(mdata);
> + }
> + mdata->sync_mr = NULL;
> + mdata->remote_info_mr = NULL;
> + rdma_dereg_ram_blocks(&mdata->rdma_ram_blocks);
> + mdata->rdma_ram_blocks.num_blocks = 0;
> +
> + if (rdma_ctx->qp) {
> + ibv_destroy_qp(rdma_ctx->qp);
> + }
> + if (rdma_ctx->cq) {
> + ibv_destroy_cq(rdma_ctx->cq);
> + }
> + if (rdma_ctx->comp_channel) {
> + ibv_destroy_comp_channel(rdma_ctx->comp_channel);
> + }
> + if (rdma_ctx->pd) {
> + ibv_dealloc_pd(rdma_ctx->pd);
> + }
> + if (rdma_ctx->listen_id) {
> + rdma_destroy_id(rdma_ctx->listen_id);
> + }
> + if (rdma_ctx->cm_id) {
> + rdma_destroy_id(rdma_ctx->cm_id);
> + }
> + if (rdma_ctx->channel) {
> + rdma_destroy_event_channel(rdma_ctx->channel);
> + }
> +
> + rdma_data_init(mdata);
> +}
> +
> +int rdma_client_init(struct rdma_data *mdata)
> +{
> + int ret;
> +
> + if (mdata->client_init_done) {
> + return 0;
> + }
> +
> + ret = rdma_resolve_host(&mdata->rdma_ctx,
> + mdata->host, mdata->port);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error resolving host!\n");
> + goto err_rdma_client_init;
> + }
> +
> + ret = rdma_alloc_pd_cq(&mdata->rdma_ctx);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
> + goto err_rdma_client_init;
> + }
> +
> + ret = rdma_alloc_qp(&mdata->rdma_ctx);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error allocating qp!\n");
> + goto err_rdma_client_init;
> + }
> +
> + ret = rdma_init_ram_blocks(&mdata->rdma_ram_blocks);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
> + goto err_rdma_client_init;
> + }
> +
> + ret = rdma_client_reg_ram_blocks(&mdata->rdma_ctx,
> + &mdata->rdma_ram_blocks);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error registering ram blocks!\n");
> + goto err_rdma_client_init;
> + }
> +
> + ret = rdma_reg_sync(mdata);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error registering sync data!\n");
> + goto err_rdma_client_init;
> + }
> +
> + ret = rdma_reg_remote_info(mdata);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error registering remote info!\n");
> + goto err_rdma_client_init;
> + }
> +
> + ret = rdma_post_recv_remote_info(mdata);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error posting remote info recv!\n");
> + goto err_rdma_client_init;
> + }
> +
> + mdata->client_init_done = 1;
> + DPRINTF("rdma_client_init success\n");
> + return 0;
> +
> +err_rdma_client_init:
> + rdma_cleanup(mdata);
> + return -1;
> +}
> +
> +int rdma_client_connect(struct rdma_data *mdata)
> +{
> + int ret;
> +
> + ret = rdma_migrate_connect(&mdata->rdma_ctx, NULL, NULL, NULL, 0);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error connecting!\n");
> + goto err_rdma_client_connect;
> + }
> +
> + /* wait for remote info */
> + ret = rdma_wait_for_wrid(&rdma_mdata,
> + RDMA_WRID_RECV_REMOTE_INFO);
> + if (ret < 0) {
> + fprintf(stderr, "rdma migration: polling remote info error!\n");
> + goto err_rdma_client_connect;
> + }
> +
> + ret = rdma_process_remote_ram_blocks(
> + &mdata->rdma_ram_blocks, &mdata->remote_info);
> + if (ret) {
> + fprintf(stderr,
> + "rdma migration: error processing remote ram blocks!\n");
> + goto err_rdma_client_connect;
> + }
> +
> + rdma_mdata.total_bytes = 0;
> + rdma_mdata.enabled = 1;
> + DPRINTF("rdma_client_connect success\n");
> + return 0;
> +
> +err_rdma_client_connect:
> + rdma_cleanup(mdata);
> + return -1;
> +}
> +
> +int rdma_start_incoming_migration(int s)
> +{
> + int ret = rdma_server_init(&rdma_mdata);
> + if (ret) {
> + return -1;
> + }
> +
> + ret = rdma_server_prepare(&rdma_mdata);
> + if (ret) {
> + return -1;
> + }
> + return 0;
> +}
> +
> +int rdma_server_init(struct rdma_data *mdata)
> +{
> +
> + int ret;
> + struct sockaddr_in sin;
> + struct rdma_cm_id *listen_id;
> + struct rdma_context *rdma_ctx = &mdata->rdma_ctx;
> +
> + if(mdata->host == NULL) {
> + printf("Error: RDMA host is not set! port: %d\n", mdata->port);
> + return -1;
> + }
> + /* create CM channel */
> + rdma_ctx->channel = rdma_create_event_channel();
> + if (!rdma_ctx->channel) {
> + return -1;
> + }
> +
> + /* create CM id */
> + ret = rdma_create_id(rdma_ctx->channel, &listen_id, NULL,
> + RDMA_PS_TCP);
> + if (ret) {
> + goto err_server_init_create_listen_id;
> + }
> +
> + memset(&sin, 0, sizeof(sin));
> + sin.sin_family = AF_INET;
> + sin.sin_port = htons(mdata->port);
> +
> + if (mdata->host && strcmp("", mdata->host)) {
> + struct hostent *server_addr;
> + server_addr = gethostbyname(mdata->host);
> + if (!server_addr) {
> + goto err_server_init_bind_addr;
> + }
> + memcpy(&sin.sin_addr.s_addr, server_addr->h_addr,
> + server_addr->h_length);
> + } else {
> + sin.sin_addr.s_addr = INADDR_ANY;
> + }
> +
> + ret = rdma_bind_addr(listen_id, (struct sockaddr *)&sin);
> + if (ret) {
> + goto err_server_init_bind_addr;
> + }
> +
> + rdma_ctx->listen_id = listen_id;
> + if (listen_id->verbs) {
> + rdma_ctx->verbs = listen_id->verbs;
> + }
> + rdma_dump_id("server_init", rdma_ctx->verbs);
> + rdma_dump_gid("server_init", listen_id);
> + return 0;
> +
> +err_server_init_bind_addr:
> + rdma_destroy_id(listen_id);
> +err_server_init_create_listen_id:
> + rdma_destroy_event_channel(rdma_ctx->channel);
> + rdma_ctx->channel = NULL;
> + fprintf(stderr, "rdma migration: error init server!\n");
> +
> + return -1;
> +
> +}
> +
> +int rdma_server_prepare(struct rdma_data *mdata)
> +{
> + int ret;
> + struct rdma_context *rdma_ctx = &mdata->rdma_ctx;
> +
> + if (!rdma_ctx->verbs) {
> + return 0;
> + }
> +
> + ret = rdma_alloc_pd_cq(rdma_ctx);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
> + goto err_rdma_server_prepare;
> + }
> +
> + ret = rdma_init_ram_blocks(&mdata->rdma_ram_blocks);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
> + goto err_rdma_server_prepare;
> + }
> +
> + ret = rdma_server_reg_ram_blocks(rdma_ctx,
> + &mdata->rdma_ram_blocks);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error registering ram blocks!\n");
> + goto err_rdma_server_prepare;
> + }
> +
> + ret = rdma_reg_sync(mdata);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error registering sync data!\n");
> + goto err_rdma_server_prepare;
> + }
> +
> + rdma_copy_to_remote_ram_blocks(&mdata->rdma_ram_blocks,
> + &mdata->remote_info);
> +
> + ret = rdma_reg_remote_info(mdata);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error registering remote info!\n");
> + goto err_rdma_server_prepare;
> + }
> +
> + ret = rdma_listen(rdma_ctx->listen_id, 5);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error listening on socket!\n");
> + goto err_rdma_server_prepare;
> + }
> +
> + return 0;
> +
> +err_rdma_server_prepare:
> + rdma_cleanup(mdata);
> + fprintf(stderr, "rdma migration: error preparing server!\n");
> + return -1;
> +}
> +
> +int rdma_accept_incoming_migration(
> + struct rdma_data *mdata)
> +{
> +
> + int ret;
> +
> + ret = rdma_migrate_listen(mdata, mdata->host, mdata->port);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error listening!\n");
> + goto err_rdma_server_wait;
> + }
> +
> + ret = rdma_alloc_qp(&mdata->rdma_ctx);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error allocating qp!\n");
> + goto err_rdma_server_wait;
> + }
> +
> +#ifdef RDMA_EXTRA_SYNC
> + ret = rdma_post_recv_sync(mdata,
> + RDMA_WRID_RECV_EXTRA_SYNC);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error posting extra sync receive!\n");
> + goto err_rdma_server_wait;
> + }
> +#endif
> +
> + ret = rdma_post_recv_sync(mdata,
> + RDMA_WRID_RECV_SYNC);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error posting sync receive!\n");
> + goto err_rdma_server_wait;
> + }
> +
> + ret = rdma_migrate_accept(&mdata->rdma_ctx, NULL, NULL, NULL, 0);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error accepting connection!\n");
> + goto err_rdma_server_wait;
> + }
> +
> + /* send remote info */
> + ret = rdma_post_send_remote_info(mdata);
> + if (ret) {
> + fprintf(stderr, "rdma migration: error sending remote info!\n");
> + goto err_rdma_server_wait;
> + }
> +
> + /* wait for completion */
> + ret = rdma_wait_for_wrid(&rdma_mdata,
> + RDMA_WRID_SEND_REMOTE_INFO);
> + if (ret < 0) {
> + fprintf(stderr, "rdma migration: polling remote info error!\n");
> + goto err_rdma_server_wait;
> + }
> +
> + rdma_mdata.total_bytes = 0;
> + rdma_mdata.enabled = 1;
> + rdma_dump_gid("server_connect", mdata->rdma_ctx.cm_id);
> + return 0;
> +
> +err_rdma_server_wait:
> + fprintf(stderr, "rdma migration: error waiting for client!\n");
> + rdma_cleanup(mdata);
> + return -1;
> +
> +}
> +
> +void rdma_data_init(struct rdma_data *mdata)
> +{
> + rdma_init_context(&mdata->rdma_ctx);
> + printf("rdma port: %d\n", rdmaport);
> + printf("rdma host: %s\n", rdmahost);
> + mdata->port = rdmaport;
> + mdata->host = rdmahost;
> + mdata->enabled = 0;
> + mdata->rdma_ram_blocks.num_blocks = 0;
> + mdata->client_init_done = 0;
> + mdata->num_unsignaled_send = 0;
> + mdata->num_signaled_send = 0;
> + mdata->current_offset = 0;
> + mdata->current_length = 0;
> + mdata->current_index = -1;
> + mdata->current_chunk = -1;
> + mdata->sync = 0;
> + mdata->sync_mr = NULL;
> + mdata->remote_info_mr = NULL;
> + rdma_update_capability(migrate_get_current());
> +}
> +
> +void rdma_disable(void)
> +{
> + rdma_mdata.port = -1;
> + rdma_mdata.enabled = 0;
> +}
> --
> 1.7.10.4
>
- Re: [Qemu-devel] [RFC PATCH RDMA support v2: 4/6] initialize RDMA options when QEMU first runs on command-line, (continued)
[Qemu-devel] [RFC PATCH RDMA support v2: 2/6] create migration-rdma.c for core RDMA migration code, Michael R. Hines, 2013/02/11
[Qemu-devel] [RFC PATCH RDMA support v2: 5/6] connection-setup code between client/server, Michael R. Hines, 2013/02/11
- Re: [Qemu-devel] [RFC PATCH RDMA support v2: 5/6] connection-setup code between client/server, Orit Wasserman, 2013/02/13
- Re: [Qemu-devel] [RFC PATCH RDMA support v2: 5/6] connection-setup code between client/server, Michael R. Hines, 2013/02/14
- Re: [Qemu-devel] [RFC PATCH RDMA support v2: 5/6] connection-setup code between client/server, Orit Wasserman, 2013/02/18
- Re: [Qemu-devel] [RFC PATCH RDMA support v2: 5/6] connection-setup code between client/server, Michael R. Hines, 2013/02/19
- Re: [Qemu-devel] [RFC PATCH RDMA support v2: 5/6] connection-setup code between client/server, Orit Wasserman, 2013/02/19
- Re: [Qemu-devel] [RFC PATCH RDMA support v2: 5/6] connection-setup code between client/server, Michael R. Hines, 2013/02/19
- Re: [Qemu-devel] [RFC PATCH RDMA support v2: 5/6] connection-setup code between client/server, Paolo Bonzini, 2013/02/19