Bug Summary

File: migration-rdma.c
Location: line 699, column 57
Description: Dereference of null pointer
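
The annotated path below (steps 22-29) runs through __qemu_rdma_delete_block(): when the last tracked block is deleted, local->block is set to NULL (step 26), yet the loop that re-inserts the remaining blocks into the hash table still indexes local->block[x] (step 29). As a purely illustrative hardening of the loop at source lines 697-702 (a sketch reusing the identifiers from that function, not the actual upstream fix), one could guard on the pointer as well as the count:

    /* Hypothetical guard; assumes the surrounding variables (rdma, local, x)
     * from __qemu_rdma_delete_block(). Not the actual upstream change. */
    if (local->nb_blocks && local->block) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap,
                                (void *)local->block[x].offset,
                                &local->block[x]);
        }
    }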

Annotated Source Code

1/*
2 * RDMA protocol and interfaces
3 *
4 * Copyright IBM, Corp. 2010-2013
5 *
6 * Authors:
7 * Michael R. Hines <mrhines@us.ibm.com>
8 * Jiuxing Liu <jl@us.ibm.com>
9 *
10 * This work is licensed under the terms of the GNU GPL, version 2 or
11 * later. See the COPYING file in the top-level directory.
12 *
13 */
14#include "qemu-common.h"
15#include "migration/migration.h"
16#include "migration/qemu-file.h"
17#include "exec/cpu-common.h"
18#include "qemu/main-loop.h"
19#include "qemu/sockets.h"
20#include "qemu/bitmap.h"
21#include "block/coroutine.h"
22#include <stdio.h>
23#include <sys/types.h>
24#include <sys/socket.h>
25#include <netdb.h>
26#include <arpa/inet.h>
27#include <string.h>
28#include <rdma/rdma_cma.h>
29
30//#define DEBUG_RDMA
31//#define DEBUG_RDMA_VERBOSE
32//#define DEBUG_RDMA_REALLY_VERBOSE
33
34#ifdef DEBUG_RDMA
35#define DPRINTF(fmt, ...) \
36 do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
37#else
38#define DPRINTF(fmt, ...) \
39 do { } while (0)
40#endif
41
42#ifdef DEBUG_RDMA_VERBOSE
43#define DDPRINTF(fmt, ...) \
44 do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
45#else
46#define DDPRINTF(fmt, ...) \
47 do { } while (0)
48#endif
49
50#ifdef DEBUG_RDMA_REALLY_VERBOSE
51#define DDDPRINTF(fmt, ...) \
52 do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
53#else
54#define DDDPRINTF(fmt, ...) \
55 do { } while (0)
56#endif
57
58/*
59 * Print an error on both the Monitor and the Log file.
60 */
61#define ERROR(errp, fmt, ...) \
62 do { \
63 fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
64 if (errp && (*(errp) == NULL)) { \
65 error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
66 } \
67 } while (0)
68
69#define RDMA_RESOLVE_TIMEOUT_MS 10000
70
71/* Do not merge data if larger than this. */
72#define RDMA_MERGE_MAX (2 * 1024 * 1024)
73#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
74
75#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */
76
77/*
78 * This is only for non-live state being migrated.
79 * Instead of RDMA_WRITE messages, we use RDMA_SEND
80 * messages for that state, which requires a different
81 * delivery design than main memory.
82 */
83#define RDMA_SEND_INCREMENT 32768
84
85/*
86 * Maximum size infiniband SEND message
87 */
88#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
89#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096
90
91#define RDMA_CONTROL_VERSION_CURRENT 1
92/*
93 * Capabilities for negotiation.
94 */
95#define RDMA_CAPABILITY_PIN_ALL 0x01
96
97/*
98 * Add the other flags above to this list of known capabilities
99 * as they are introduced.
100 */
101static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;
102
103#define CHECK_ERROR_STATE() \
104 do { \
105 if (rdma->error_state) { \
106 if (!rdma->error_reported) { \
107 fprintf(stderr, "RDMA is in an error state waiting migration" \
108 " to abort!\n"); \
109 rdma->error_reported = 1; \
110 } \
111 return rdma->error_state; \
112 } \
113 } while (0);
114
115/*
116 * A work request ID is 64-bits and we split up these bits
117 * into 3 parts:
118 *
119 * bits 0-15 : type of control message, 2^16
120 * bits 16-29: ram block index, 2^14
121 * bits 30-63: ram block chunk number, 2^34
122 *
123 * The last two bit ranges are only used for RDMA writes,
124 * in order to track their completion and potentially
125 * also track unregistration status of the message.
126 */
127#define RDMA_WRID_TYPE_SHIFT  0UL
128#define RDMA_WRID_BLOCK_SHIFT 16UL
129#define RDMA_WRID_CHUNK_SHIFT 30UL
130
131#define RDMA_WRID_TYPE_MASK \
132 ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)
133
134#define RDMA_WRID_BLOCK_MASK \
135 (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))
136
137#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)
138
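For reference, a work request ID built with these shifts is taken apart the same way later in the file (see the chunk/index extraction around source lines 1258-1261 and 1406-1409). A hypothetical helper, not present in the source, that performs the reverse of qemu_rdma_make_wrid() using only the macros above:

    /* Illustrative only: split a wr_id into its three fields. */
    static inline void qemu_rdma_split_wrid(uint64_t wr_id, uint64_t *type,
                                            uint64_t *index, uint64_t *chunk)
    {
        *type  = wr_id & RDMA_WRID_TYPE_MASK;
        *index = (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
        *chunk = (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
    }
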
139/*
140 * RDMA migration protocol:
141 * 1. RDMA Writes (data messages, i.e. RAM)
142 * 2. IB Send/Recv (control channel messages)
143 */
144enum {
145 RDMA_WRID_NONE = 0,
146 RDMA_WRID_RDMA_WRITE = 1,
147 RDMA_WRID_SEND_CONTROL = 2000,
148 RDMA_WRID_RECV_CONTROL = 4000,
149};
150
151const char *wrid_desc[] = {
152 [RDMA_WRID_NONE] = "NONE",
153 [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
154 [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
155 [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
156};
157
158/*
159 * Work request IDs for IB SEND messages only (not RDMA writes).
160 * This is used by the migration protocol to transmit
161 * control messages (such as device state and registration commands)
162 *
163 * We could use more WRs, but we have enough for now.
164 */
165enum {
166 RDMA_WRID_READY = 0,
167 RDMA_WRID_DATA,
168 RDMA_WRID_CONTROL,
169 RDMA_WRID_MAX,
170};
171
172/*
173 * SEND/RECV IB Control Messages.
174 */
175enum {
176 RDMA_CONTROL_NONE = 0,
177 RDMA_CONTROL_ERROR,
178 RDMA_CONTROL_READY, /* ready to receive */
179 RDMA_CONTROL_QEMU_FILE, /* QEMUFile-transmitted bytes */
180 RDMA_CONTROL_RAM_BLOCKS_REQUEST, /* RAMBlock synchronization */
181 RDMA_CONTROL_RAM_BLOCKS_RESULT, /* RAMBlock synchronization */
182 RDMA_CONTROL_COMPRESS, /* page contains repeat values */
183 RDMA_CONTROL_REGISTER_REQUEST, /* dynamic page registration */
184 RDMA_CONTROL_REGISTER_RESULT, /* key to use after registration */
185 RDMA_CONTROL_REGISTER_FINISHED, /* current iteration finished */
186 RDMA_CONTROL_UNREGISTER_REQUEST, /* dynamic UN-registration */
187 RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
188};
189
190const char *control_desc[] = {
191 [RDMA_CONTROL_NONE] = "NONE",
192 [RDMA_CONTROL_ERROR] = "ERROR",
193 [RDMA_CONTROL_READY] = "READY",
194 [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
195 [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
196 [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
197 [RDMA_CONTROL_COMPRESS] = "COMPRESS",
198 [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
199 [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
200 [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
201 [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
202 [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
203};
204
205/*
206 * Memory and MR structures used to represent an IB Send/Recv work request.
207 * This is *not* used for RDMA writes, only IB Send/Recv.
208 */
209typedef struct {
210 uint8_t control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
211 struct ibv_mr *control_mr; /* registration metadata */
212 size_t control_len; /* length of the message */
213 uint8_t *control_curr; /* start of unconsumed bytes */
214} RDMAWorkRequestData;
215
216/*
217 * Negotiate RDMA capabilities during connection-setup time.
218 */
219typedef struct {
220 uint32_t version;
221 uint32_t flags;
222} RDMACapabilities;
223
224static void caps_to_network(RDMACapabilities *cap)
225{
226 cap->version = htonl(cap->version);
227 cap->flags = htonl(cap->flags);
228}
229
230static void network_to_caps(RDMACapabilities *cap)
231{
232 cap->version = ntohl(cap->version);
233 cap->flags = ntohl(cap->flags);
234}
235
236/*
237 * Representation of a RAMBlock from an RDMA perspective.
238 * This is not transmitted, only local.
239 * This and subsequent structures cannot be linked lists
240 * because we're using a single IB message to transmit
241 * the information. It's small anyway, so a list is overkill.
242 */
243typedef struct RDMALocalBlock {
244 uint8_t *local_host_addr; /* local virtual address */
245 uint64_t remote_host_addr; /* remote virtual address */
246 uint64_t offset;
247 uint64_t length;
248 struct ibv_mr **pmr; /* MRs for chunk-level registration */
249 struct ibv_mr *mr; /* MR for non-chunk-level registration */
250 uint32_t *remote_keys; /* rkeys for chunk-level registration */
251 uint32_t remote_rkey; /* rkeys for non-chunk-level registration */
252 int index; /* which block are we */
253 bool is_ram_block;
254 int nb_chunks;
255 unsigned long *transit_bitmap;
256 unsigned long *unregister_bitmap;
257} RDMALocalBlock;
258
259/*
260 * Also represents a RAMblock, but only on the dest.
261 * This gets transmitted by the dest during connection-time
262 * to the source VM and then is used to populate the
263 * corresponding RDMALocalBlock with
264 * the information needed to perform the actual RDMA.
265 */
266typedef struct QEMU_PACKED RDMARemoteBlock {
267 uint64_t remote_host_addr;
268 uint64_t offset;
269 uint64_t length;
270 uint32_t remote_rkey;
271 uint32_t padding;
272} RDMARemoteBlock;
273
274static uint64_t htonll(uint64_t v)
275{
276 union { uint32_t lv[2]; uint64_t llv; } u;
277 u.lv[0] = htonl(v >> 32);
278 u.lv[1] = htonl(v & 0xFFFFFFFFULL);
279 return u.llv;
280}
281
282static uint64_t ntohll(uint64_t v) {
283 union { uint32_t lv[2]; uint64_t llv; } u;
284 u.llv = v;
285 return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
286}
287
288static void remote_block_to_network(RDMARemoteBlock *rb)
289{
290 rb->remote_host_addr = htonll(rb->remote_host_addr);
291 rb->offset = htonll(rb->offset);
292 rb->length = htonll(rb->length);
293 rb->remote_rkey = htonl(rb->remote_rkey);
294}
295
296static void network_to_remote_block(RDMARemoteBlock *rb)
297{
298 rb->remote_host_addr = ntohll(rb->remote_host_addr);
299 rb->offset = ntohll(rb->offset);
300 rb->length = ntohll(rb->length);
301 rb->remote_rkey = ntohl(rb->remote_rkey);
302}
303
304/*
305 * Virtual address of the above structures used for transmitting
306 * the RAMBlock descriptions at connection-time.
307 * This structure is *not* transmitted.
308 */
309typedef struct RDMALocalBlocks {
310 int nb_blocks;
311 bool init; /* main memory init complete */
312 RDMALocalBlock *block;
313} RDMALocalBlocks;
314
315/*
316 * Main data structure for RDMA state.
317 * While there is only one copy of this structure being allocated right now,
318 * this is the place where one would start if you wanted to consider
319 * having more than one RDMA connection open at the same time.
320 */
321typedef struct RDMAContext {
322 char *host;
323 int port;
324
325 RDMAWorkRequestData wr_data[RDMA_WRID_MAX];
326
327 /*
328 * This is used by *_exchange_send() to figure out whether or not
329 * the initial "READY" message has already been received or not.
330 * This is because other functions may potentially poll() and detect
331 * the READY message before send() does, in which case we need to
332 * know if it completed.
333 */
334 int control_ready_expected;
335
336 /* number of outstanding writes */
337 int nb_sent;
338
339 /* store info about current buffer so that we can
340 merge it with future sends */
341 uint64_t current_addr;
342 uint64_t current_length;
343 /* index of ram block the current buffer belongs to */
344 int current_index;
345 /* index of the chunk in the current ram block */
346 int current_chunk;
347
348 bool pin_all;
349
350 /*
351 * infiniband-specific variables for opening the device
352 * and maintaining connection state and so forth.
353 *
354 * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
355 * cm_id->verbs, cm_id->channel, and cm_id->qp.
356 */
357 struct rdma_cm_id *cm_id; /* connection manager ID */
358 struct rdma_cm_id *listen_id;
359 bool connected;
360
361 struct ibv_context *verbs;
362 struct rdma_event_channel *channel;
363 struct ibv_qp *qp; /* queue pair */
364 struct ibv_comp_channel *comp_channel; /* completion channel */
365 struct ibv_pd *pd; /* protection domain */
366 struct ibv_cq *cq; /* completion queue */
367
368 /*
369 * If a previous write failed (perhaps because of a failed
370 * memory registration), then do not attempt any future work
371 * and remember the error state.
372 */
373 int error_state;
374 int error_reported;
375
376 /*
377 * Description of ram blocks used throughout the code.
378 */
379 RDMALocalBlocks local_ram_blocks;
380 RDMARemoteBlock *block;
381
382 /*
383 * Migration on *destination* started.
384 * Then use coroutine yield function.
385 * Source runs in a thread, so we don't care.
386 */
387 int migration_started_on_destination;
388
389 int total_registrations;
390 int total_writes;
391
392 int unregister_current, unregister_next;
393 uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];
394
395 GHashTable *blockmap;
396} RDMAContext;
397
398/*
399 * Interface to the rest of the migration call stack.
400 */
401typedef struct QEMUFileRDMA {
402 RDMAContext *rdma;
403 size_t len;
404 void *file;
405} QEMUFileRDMA;
406
407/*
408 * Main structure for IB Send/Recv control messages.
409 * This gets prepended at the beginning of every Send/Recv.
410 */
411typedef struct QEMU_PACKED {
412 uint32_t len; /* Total length of data portion */
413 uint32_t type; /* which control command to perform */
414 uint32_t repeat; /* number of commands in data portion of same type */
415 uint32_t padding;
416} RDMAControlHeader;
417
418static void control_to_network(RDMAControlHeader *control)
419{
420 control->type = htonl(control->type);
421 control->len = htonl(control->len);
422 control->repeat = htonl(control->repeat);
423}
424
425static void network_to_control(RDMAControlHeader *control)
426{
427 control->type = ntohl(control->type);
428 control->len = ntohl(control->len);
429 control->repeat = ntohl(control->repeat);
430}
431
432/*
433 * Register a single Chunk.
434 * Information sent by the source VM to inform the dest
435 * to register a single chunk of memory before we can perform
436 * the actual RDMA operation.
437 */
438typedef struct QEMU_PACKED {
439 union QEMU_PACKED {
440 uint64_t current_addr; /* offset into the ramblock of the chunk */
441 uint64_t chunk; /* chunk to lookup if unregistering */
442 } key;
443 uint32_t current_index; /* which ramblock the chunk belongs to */
444 uint32_t padding;
445 uint64_t chunks; /* how many sequential chunks to register */
446} RDMARegister;
447
448static void register_to_network(RDMARegister *reg)
449{
450 reg->key.current_addr = htonll(reg->key.current_addr);
451 reg->current_index = htonl(reg->current_index);
452 reg->chunks = htonll(reg->chunks);
453}
454
455static void network_to_register(RDMARegister *reg)
456{
457 reg->key.current_addr = ntohll(reg->key.current_addr);
458 reg->current_index = ntohl(reg->current_index);
459 reg->chunks = ntohll(reg->chunks);
460}
461
462typedef struct QEMU_PACKED {
463 uint32_t value; /* if zero, we will madvise() */
464 uint32_t block_idx; /* which ram block index */
465 uint64_t offset; /* where in the remote ramblock this chunk */
466 uint64_t length; /* length of the chunk */
467} RDMACompress;
468
469static void compress_to_network(RDMACompress *comp)
470{
471 comp->value = htonl(comp->value);
472 comp->block_idx = htonl(comp->block_idx);
473 comp->offset = htonll(comp->offset);
474 comp->length = htonll(comp->length);
475}
476
477static void network_to_compress(RDMACompress *comp)
478{
479 comp->value = ntohl(comp->value);
480 comp->block_idx = ntohl(comp->block_idx);
481 comp->offset = ntohll(comp->offset);
482 comp->length = ntohll(comp->length);
483}
484
485/*
486 * The result of the dest's memory registration produces an "rkey"
487 * which the source VM must reference in order to perform
488 * the RDMA operation.
489 */
490typedef struct QEMU_PACKED {
491 uint32_t rkey;
492 uint32_t padding;
493 uint64_t host_addr;
494} RDMARegisterResult;
495
496static void result_to_network(RDMARegisterResult *result)
497{
498 result->rkey = htonl(result->rkey);
499 result->host_addr = htonll(result->host_addr);
500};
501
502static void network_to_result(RDMARegisterResult *result)
503{
504 result->rkey = ntohl(result->rkey);
505 result->host_addr = ntohll(result->host_addr);
506};
507
508const char *print_wrid(int wrid);
509static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
510 uint8_t *data, RDMAControlHeader *resp,
511 int *resp_idx,
512 int (*callback)(RDMAContext *rdma));
513
514static inline uint64_t ram_chunk_index(const uint8_t *start,
515 const uint8_t *host)
516{
517 return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
518}
519
520static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
521 uint64_t i)
522{
523 return (uint8_t *) (((uintptr_t) rdma_ram_block->local_host_addr)
524 + (i << RDMA_REG_CHUNK_SHIFT));
525}
526
527static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
528 uint64_t i)
529{
530 uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
531 (1UL << RDMA_REG_CHUNK_SHIFT);
532
533 if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
534 result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
535 }
536
537 return result;
538}
539
540static int __qemu_rdma_add_block(RDMAContext *rdma, void *host_addr,
541 ram_addr_t block_offset, uint64_t length)
542{
543 RDMALocalBlocks *local = &rdma->local_ram_blocks;
544 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
545 (void *) block_offset);
546 RDMALocalBlock *old = local->block;
547
548 assert(block == NULL);
549
550 local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));
551
552 if (local->nb_blocks) {
553 int x;
554
555 for (x = 0; x < local->nb_blocks; x++) {
556 g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
557 g_hash_table_insert(rdma->blockmap, (void *)old[x].offset,
558 &local->block[x]);
559 }
560 memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
561 g_free(old);
562 }
563
564 block = &local->block[local->nb_blocks];
565
566 block->local_host_addr = host_addr;
567 block->offset = block_offset;
568 block->length = length;
569 block->index = local->nb_blocks;
570 block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
571 block->transit_bitmap = bitmap_new(block->nb_chunks);
572 bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
573 block->unregister_bitmap = bitmap_new(block->nb_chunks);
574 bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
575 block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t));
576
577 block->is_ram_block = local->init ? false : true;
578
579 g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
580
581 DDPRINTF("Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64do { } while (0)
582 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",do { } while (0)
583 local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,do { } while (0)
584 block->length, (uint64_t) (block->local_host_addr + block->length),do { } while (0)
585 BITS_TO_LONGS(block->nb_chunks) *do { } while (0)
586 sizeof(unsigned long) * 8, block->nb_chunks)do { } while (0);
587
588 local->nb_blocks++;
589
590 return 0;
591}
592
593/*
594 * Memory regions need to be registered with the device and queue pairs set up
595 * in advance before the migration starts. This tells us where the RAM blocks
596 * are so that we can register them individually.
597 */
598static void qemu_rdma_init_one_block(void *host_addr,
599 ram_addr_t block_offset, ram_addr_t length, void *opaque)
600{
601 __qemu_rdma_add_block(opaque, host_addr, block_offset, length);
602}
603
604/*
605 * Identify the RAMBlocks and their quantity. They will be used to
606 * identify chunk boundaries inside each RAMBlock and also be referenced
607 * during dynamic page registration.
608 */
609static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
610{
611 RDMALocalBlocks *local = &rdma->local_ram_blocks;
612
613 assert(rdma->blockmap == NULL);
614 rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
615 memset(local, 0, sizeof *local);
616 qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
617 DPRINTF("Allocated %d local ram block structures\n", local->nb_blocks)do { } while (0);
618 rdma->block = (RDMARemoteBlock *) g_malloc0(sizeof(RDMARemoteBlock) *
619 rdma->local_ram_blocks.nb_blocks);
620 local->init = true;
621 return 0;
622}
623
624static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
625{
626 RDMALocalBlocks *local = &rdma->local_ram_blocks;
627 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
628 (void *) block_offset);
629 RDMALocalBlock *old = local->block;
630 int x;
631
632 assert(block);
633
634 if (block->pmr) {
[22] Taking false branch
635 int j;
636
637 for (j = 0; j < block->nb_chunks; j++) {
638 if (!block->pmr[j]) {
639 continue;
640 }
641 ibv_dereg_mr(block->pmr[j]);
642 rdma->total_registrations--;
643 }
644 g_free(block->pmr);
645 block->pmr = NULL;
646 }
647
648 if (block->mr) {
[23] Taking false branch
649 ibv_dereg_mr(block->mr);
650 rdma->total_registrations--;
651 block->mr = NULL;
652 }
653
654 g_free(block->transit_bitmap);
655 block->transit_bitmap = NULL;
656
657 g_free(block->unregister_bitmap);
658 block->unregister_bitmap = NULL;
659
660 g_free(block->remote_keys);
661 block->remote_keys = NULL;
662
663 for (x = 0; x < local->nb_blocks; x++) {
[24] Loop condition is false. Execution continues on line 667
664 g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
665 }
666
667 if (local->nb_blocks > 1) {
[25] Taking false branch
668
669 local->block = g_malloc0(sizeof(RDMALocalBlock) *
670 (local->nb_blocks - 1));
671
672 if (block->index) {
673 memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
674 }
675
676 if (block->index < (local->nb_blocks - 1)) {
677 memcpy(local->block + block->index, old + (block->index + 1),
678 sizeof(RDMALocalBlock) *
679 (local->nb_blocks - (block->index + 1)));
680 }
681 } else {
682 assert(block == local->block);
683 local->block = NULL;
[26] Null pointer value stored to field 'block'
684 }
685
686 DDPRINTF("Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64do { } while (0)
687 " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",do { } while (0)
688 local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,do { } while (0)
689 block->length, (uint64_t) (block->local_host_addr + block->length),do { } while (0)
690 BITS_TO_LONGS(block->nb_chunks) *do { } while (0)
691 sizeof(unsigned long) * 8, block->nb_chunks)do { } while (0);
692
693 g_free(old);
694
695 local->nb_blocks--;
696
697 if (local->nb_blocks) {
[27] Taking true branch
698 for (x = 0; x < local->nb_blocks; x++) {
[28] Loop condition is true. Entering loop body
699 g_hash_table_insert(rdma->blockmap, (void *)local->block[x].offset,
[29] Dereference of null pointer
700 &local->block[x]);
701 }
702 }
703
704 return 0;
705}
706
707/*
708 * Put in the log file which RDMA device was opened and the details
709 * associated with that device.
710 */
711static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
712{
713 struct ibv_port_attr port;
714
715 if (ibv_query_port(verbs, 1, &port)) {
716 fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
717 return;
718 }
719
720 printf("%s RDMA Device opened: kernel name %s "
721 "uverbs device name %s, "
722 "infiniband_verbs class device path %s, "
723 "infiniband class device path %s, "
724 "transport: (%d) %s\n",
725 who,
726 verbs->device->name,
727 verbs->device->dev_name,
728 verbs->device->dev_path,
729 verbs->device->ibdev_path,
730 port.link_layer,
731 (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
732 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
733 ? "Ethernet" : "Unknown"));
734}
735
736/*
737 * Put in the log file the RDMA gid addressing information,
738 * useful for folks who have trouble understanding the
739 * RDMA device hierarchy in the kernel.
740 */
741static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
742{
743 char sgid[33];
744 char dgid[33];
745 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
746 inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
747 DPRINTF("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid);
748}
749
750/*
751 * As of now, IPv6 over RoCE / iWARP is not supported by linux.
752 * We will try the next addrinfo struct, and fail if there are
753 * no other valid addresses to bind against.
754 *
755 * If the user is listening on '[::]', then we will not have opened a device
756 * yet and have no way of verifying if the device is RoCE or not.
757 *
758 * In this case, the source VM will throw an error for ALL types of
759 * connections (both IPv4 and IPv6) if the destination machine does not have
760 * a regular infiniband network available for use.
761 *
762 * The only way to guarantee that an error is thrown for broken kernels is
763 * for the management software to choose a *specific* interface at bind time
764 * and validate what type of hardware it is.
765 *
766 * Unfortunately, this puts the user in a fix:
767 *
768 * If the source VM connects with an IPv4 address without knowing that the
769 * destination has bound to '[::]' the migration will unconditionally fail
770 * unless the management software is explicitly listening on the IPv4
771 * address while using a RoCE-based device.
772 *
773 * If the source VM connects with an IPv6 address, then we're OK because we can
774 * throw an error on the source (and similarly on the destination).
775 *
776 * But in mixed environments, this will be broken for a while until it is fixed
777 * inside linux.
778 *
779 * We do provide a *tiny* bit of help in this function: We can list all of the
780 * devices in the system and check to see if all the devices are RoCE or
781 * Infiniband.
782 *
783 * If we detect that we have a *pure* RoCE environment, then we can safely
784 * throw an error even if the management software has specified '[::]' as the
785 * bind address.
786 *
787 * However, if there are multiple heterogeneous devices, then we cannot make
788 * this assumption and the user just has to be sure they know what they are
789 * doing.
790 *
791 * Patches are being reviewed on linux-rdma.
792 */
793static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
794{
795 struct ibv_port_attr port_attr;
796
797 /* This bug only exists in linux, to our knowledge. */
798#ifdef CONFIG_LINUX
799
800 /*
801 * Verbs are only NULL if management has bound to '[::]'.
802 *
803 * Let's iterate through all the devices and see if there any pure IB
804 * devices (non-ethernet).
805 *
806 * If not, then we can safely proceed with the migration.
807 * Otherwise, there are no guarantees until the bug is fixed in linux.
808 */
809 if (!verbs) {
810 int num_devices, x;
811 struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
812 bool roce_found = false;
813 bool ib_found = false;
814
815 for (x = 0; x < num_devices; x++) {
816 verbs = ibv_open_device(dev_list[x]);
817
818 if (ibv_query_port(verbs, 1, &port_attr)) {
819 ibv_close_device(verbs);
820 ERROR(errp, "Could not query initial IB port");
821 return -EINVAL;
822 }
823
824 if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
825 ib_found = true;
826 } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
827 roce_found = true;
828 }
829
830 ibv_close_device(verbs);
831
832 }
833
834 if (roce_found) {
835 if (ib_found) {
836 fprintf(stderrstderr, "WARN: migrations may fail:"
837 " IPv6 over RoCE / iWARP in linux"
838 " is broken. But since you appear to have a"
839 " mixed RoCE / IB environment, be sure to only"
840 " migrate over the IB fabric until the kernel "
841 " fixes the bug.\n");
842 } else {
843 ERROR(errp, "You only have RoCE / iWARP devices in your systems"do { fprintf(stderr, "RDMA ERROR: " "You only have RoCE / iWARP devices in your systems"
" and your management software has specified '[::]'" ", but IPv6 over RoCE / iWARP is not supported in Linux."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "You only have RoCE / iWARP devices in your systems"
" and your management software has specified '[::]'" ", but IPv6 over RoCE / iWARP is not supported in Linux."
); } } while (0)
844 " and your management software has specified '[::]'"do { fprintf(stderr, "RDMA ERROR: " "You only have RoCE / iWARP devices in your systems"
" and your management software has specified '[::]'" ", but IPv6 over RoCE / iWARP is not supported in Linux."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "You only have RoCE / iWARP devices in your systems"
" and your management software has specified '[::]'" ", but IPv6 over RoCE / iWARP is not supported in Linux."
); } } while (0)
845 ", but IPv6 over RoCE / iWARP is not supported in Linux.")do { fprintf(stderr, "RDMA ERROR: " "You only have RoCE / iWARP devices in your systems"
" and your management software has specified '[::]'" ", but IPv6 over RoCE / iWARP is not supported in Linux."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "You only have RoCE / iWARP devices in your systems"
" and your management software has specified '[::]'" ", but IPv6 over RoCE / iWARP is not supported in Linux."
); } } while (0)
;
846 return -ENONET64;
847 }
848 }
849
850 return 0;
851 }
852
853 /*
854 * If we have a verbs context, that means that something other than '[::]' was
855 * used by the management software for binding. In that case we can actually
856 * warn the user about a potentially broken kernel.
857 */
858
859 /* IB ports start with 1, not 0 */
860 if (ibv_query_port(verbs, 1, &port_attr)) {
861 ERROR(errp, "Could not query initial IB port");
862 return -EINVAL;
863 }
864
865 if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
866 ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "do { fprintf(stderr, "RDMA ERROR: " "Linux kernel's RoCE / iWARP does not support IPv6 "
"(but patches on linux-rdma in progress)" "\n"); if (errp &&
(*(errp) == ((void*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR
, "RDMA ERROR: " "Linux kernel's RoCE / iWARP does not support IPv6 "
"(but patches on linux-rdma in progress)"); } } while (0)
867 "(but patches on linux-rdma in progress)")do { fprintf(stderr, "RDMA ERROR: " "Linux kernel's RoCE / iWARP does not support IPv6 "
"(but patches on linux-rdma in progress)" "\n"); if (errp &&
(*(errp) == ((void*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR
, "RDMA ERROR: " "Linux kernel's RoCE / iWARP does not support IPv6 "
"(but patches on linux-rdma in progress)"); } } while (0)
;
868 return -ENONET64;
869 }
870
871#endif
872
873 return 0;
874}
875
876/*
877 * Figure out which RDMA device corresponds to the requested IP hostname
878 * Also create the initial connection manager identifiers for opening
879 * the connection.
880 */
881static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
882{
883 int ret;
884 struct rdma_addrinfo *res;
885 char port_str[16];
886 struct rdma_cm_event *cm_event;
887 char ip[40] = "unknown";
888 struct rdma_addrinfo *e;
889
890 if (rdma->host == NULL || !strcmp(rdma->host, "")) {
891 ERROR(errp, "RDMA hostname has not been set");
892 return -EINVAL;
893 }
894
895 /* create CM channel */
896 rdma->channel = rdma_create_event_channel();
897 if (!rdma->channel) {
898 ERROR(errp, "could not create CM channel")do { fprintf(stderr, "RDMA ERROR: " "could not create CM channel"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "could not create CM channel"
); } } while (0)
;
899 return -EINVAL22;
900 }
901
902 /* create CM id */
903 ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
904 if (ret) {
905 ERROR(errp, "could not create channel id")do { fprintf(stderr, "RDMA ERROR: " "could not create channel id"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "could not create channel id"
); } } while (0)
;
906 goto err_resolve_create_id;
907 }
908
909 snprintf(port_str, 16, "%d", rdma->port);
910 port_str[15] = '\0';
911
912 ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
913 if (ret < 0) {
914 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host)do { fprintf(stderr, "RDMA ERROR: " "could not rdma_getaddrinfo address %s"
"\n", rdma->host); if (errp && (*(errp) == ((void
*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: "
"could not rdma_getaddrinfo address %s", rdma->host); } }
while (0)
;
915 goto err_resolve_get_addr;
916 }
917
918 for (e = res; e != NULL; e = e->ai_next) {
919 inet_ntop(e->ai_family,
920 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
921 DPRINTF("Trying %s => %s\n", rdma->host, ip);
922
923 ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
924 RDMA_RESOLVE_TIMEOUT_MS);
925 if (!ret) {
926 if (e->ai_family == AF_INET6) {
927 ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
928 if (ret) {
929 continue;
930 }
931 }
932 goto route;
933 }
934 }
935
936 ERROR(errp, "could not resolve address %s", rdma->host)do { fprintf(stderr, "RDMA ERROR: " "could not resolve address %s"
"\n", rdma->host); if (errp && (*(errp) == ((void
*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: "
"could not resolve address %s", rdma->host); } } while (0
)
;
937 goto err_resolve_get_addr;
938
939route:
940 qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);
941
942 ret = rdma_get_cm_event(rdma->channel, &cm_event);
943 if (ret) {
944 ERROR(errp, "could not perform event_addr_resolved")do { fprintf(stderr, "RDMA ERROR: " "could not perform event_addr_resolved"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "could not perform event_addr_resolved"
); } } while (0)
;
945 goto err_resolve_get_addr;
946 }
947
948 if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
949 ERROR(errp, "result not equal to event_addr_resolved %s",do { fprintf(stderr, "RDMA ERROR: " "result not equal to event_addr_resolved %s"
"\n", rdma_event_str(cm_event->event)); if (errp &&
(*(errp) == ((void*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR
, "RDMA ERROR: " "result not equal to event_addr_resolved %s"
, rdma_event_str(cm_event->event)); } } while (0)
950 rdma_event_str(cm_event->event))do { fprintf(stderr, "RDMA ERROR: " "result not equal to event_addr_resolved %s"
"\n", rdma_event_str(cm_event->event)); if (errp &&
(*(errp) == ((void*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR
, "RDMA ERROR: " "result not equal to event_addr_resolved %s"
, rdma_event_str(cm_event->event)); } } while (0)
;
951 perror("rdma_resolve_addr");
952 ret = -EINVAL;
953 goto err_resolve_get_addr;
954 }
955 rdma_ack_cm_event(cm_event);
956
957 /* resolve route */
958 ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
959 if (ret) {
960 ERROR(errp, "could not resolve rdma route")do { fprintf(stderr, "RDMA ERROR: " "could not resolve rdma route"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "could not resolve rdma route"
); } } while (0)
;
961 goto err_resolve_get_addr;
962 }
963
964 ret = rdma_get_cm_event(rdma->channel, &cm_event);
965 if (ret) {
966 ERROR(errp, "could not perform event_route_resolved")do { fprintf(stderr, "RDMA ERROR: " "could not perform event_route_resolved"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "could not perform event_route_resolved"
); } } while (0)
;
967 goto err_resolve_get_addr;
968 }
969 if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
970 ERROR(errp, "result not equal to event_route_resolved: %s",do { fprintf(stderr, "RDMA ERROR: " "result not equal to event_route_resolved: %s"
"\n", rdma_event_str(cm_event->event)); if (errp &&
(*(errp) == ((void*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR
, "RDMA ERROR: " "result not equal to event_route_resolved: %s"
, rdma_event_str(cm_event->event)); } } while (0)
971 rdma_event_str(cm_event->event))do { fprintf(stderr, "RDMA ERROR: " "result not equal to event_route_resolved: %s"
"\n", rdma_event_str(cm_event->event)); if (errp &&
(*(errp) == ((void*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR
, "RDMA ERROR: " "result not equal to event_route_resolved: %s"
, rdma_event_str(cm_event->event)); } } while (0)
;
972 rdma_ack_cm_event(cm_event);
973 ret = -EINVAL;
974 goto err_resolve_get_addr;
975 }
976 rdma_ack_cm_event(cm_event);
977 rdma->verbs = rdma->cm_id->verbs;
978 qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
979 qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
980 return 0;
981
982err_resolve_get_addr:
983 rdma_destroy_id(rdma->cm_id);
984 rdma->cm_id = NULL;
985err_resolve_create_id:
986 rdma_destroy_event_channel(rdma->channel);
987 rdma->channel = NULL;
988 return ret;
989}
990
991/*
992 * Create protection domain and completion queues
993 */
994static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
995{
996 /* allocate pd */
997 rdma->pd = ibv_alloc_pd(rdma->verbs);
998 if (!rdma->pd) {
999 fprintf(stderrstderr, "failed to allocate protection domain\n");
1000 return -1;
1001 }
1002
1003 /* create completion channel */
1004 rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
1005 if (!rdma->comp_channel) {
1006 fprintf(stderrstderr, "failed to allocate completion channel\n");
1007 goto err_alloc_pd_cq;
1008 }
1009
1010 /*
1011 * Completion queue can be filled by both read and write work requests,
1012 * so must reflect the sum of both possible queue sizes.
1013 */
1014 rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
1015 NULL, rdma->comp_channel, 0);
1016 if (!rdma->cq) {
1017 fprintf(stderrstderr, "failed to allocate completion queue\n");
1018 goto err_alloc_pd_cq;
1019 }
1020
1021 return 0;
1022
1023err_alloc_pd_cq:
1024 if (rdma->pd) {
1025 ibv_dealloc_pd(rdma->pd);
1026 }
1027 if (rdma->comp_channel) {
1028 ibv_destroy_comp_channel(rdma->comp_channel);
1029 }
1030 rdma->pd = NULL;
1031 rdma->comp_channel = NULL;
1032 return -1;
1033
1034}
1035
1036/*
1037 * Create queue pairs.
1038 */
1039static int qemu_rdma_alloc_qp(RDMAContext *rdma)
1040{
1041 struct ibv_qp_init_attr attr = { 0 };
1042 int ret;
1043
1044 attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
1045 attr.cap.max_recv_wr = 3;
1046 attr.cap.max_send_sge = 1;
1047 attr.cap.max_recv_sge = 1;
1048 attr.send_cq = rdma->cq;
1049 attr.recv_cq = rdma->cq;
1050 attr.qp_type = IBV_QPT_RC;
1051
1052 ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
1053 if (ret) {
1054 return -1;
1055 }
1056
1057 rdma->qp = rdma->cm_id->qp;
1058 return 0;
1059}
1060
1061static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
1062{
1063 int i;
1064 RDMALocalBlocks *local = &rdma->local_ram_blocks;
1065
1066 for (i = 0; i < local->nb_blocks; i++) {
1067 local->block[i].mr =
1068 ibv_reg_mr(rdma->pd,
1069 local->block[i].local_host_addr,
1070 local->block[i].length,
1071 IBV_ACCESS_LOCAL_WRITE |
1072 IBV_ACCESS_REMOTE_WRITE
1073 );
1074 if (!local->block[i].mr) {
1075 perror("Failed to register local dest ram block!\n");
1076 break;
1077 }
1078 rdma->total_registrations++;
1079 }
1080
1081 if (i >= local->nb_blocks) {
1082 return 0;
1083 }
1084
1085 for (i--; i >= 0; i--) {
1086 ibv_dereg_mr(local->block[i].mr);
1087 rdma->total_registrations--;
1088 }
1089
1090 return -1;
1091
1092}
1093
1094/*
1095 * Find the ram block that corresponds to the page requested to be
1096 * transmitted by QEMU.
1097 *
1098 * Once the block is found, also identify which 'chunk' within that
1100 * block the page belongs to.
1100 *
1101 * This search cannot fail or the migration will fail.
1102 */
1103static int qemu_rdma_search_ram_block(RDMAContext *rdma,
1104 uint64_t block_offset,
1105 uint64_t offset,
1106 uint64_t length,
1107 uint64_t *block_index,
1108 uint64_t *chunk_index)
1109{
1110 uint64_t current_addr = block_offset + offset;
1111 RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
1112 (void *) block_offset);
1113 assert(block);
1114 assert(current_addr >= block->offset);
1115 assert((current_addr + length) <= (block->offset + block->length));
1116
1117 *block_index = block->index;
1118 *chunk_index = ram_chunk_index(block->local_host_addr,
1119 block->local_host_addr + (current_addr - block->offset));
1120
1121 return 0;
1122}
1123
1124/*
1125 * Register a chunk with IB. If the chunk was already registered
1126 * previously, then skip.
1127 *
1128 * Also return the keys associated with the registration needed
1129 * to perform the actual RDMA operation.
1130 */
1131static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
1132 RDMALocalBlock *block, uint8_t *host_addr,
1133 uint32_t *lkey, uint32_t *rkey, int chunk,
1134 uint8_t *chunk_start, uint8_t *chunk_end)
1135{
1136 if (block->mr) {
1137 if (lkey) {
1138 *lkey = block->mr->lkey;
1139 }
1140 if (rkey) {
1141 *rkey = block->mr->rkey;
1142 }
1143 return 0;
1144 }
1145
1146 /* allocate memory to store chunk MRs */
1147 if (!block->pmr) {
1148 block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *));
1149 if (!block->pmr) {
1150 return -1;
1151 }
1152 }
1153
1154 /*
1155 * If 'rkey', then we're the destination, so grant access to the source.
1156 *
1157 * If 'lkey', then we're the source VM, so grant access only to ourselves.
1158 */
1159 if (!block->pmr[chunk]) {
1160 uint64_t len = chunk_end - chunk_start;
1161
1162 DDPRINTF("Registering %" PRIu64 " bytes @ %p\n",do { } while (0)
1163 len, chunk_start)do { } while (0);
1164
1165 block->pmr[chunk] = ibv_reg_mr(rdma->pd,
1166 chunk_start, len,
1167 (rkey ? (IBV_ACCESS_LOCAL_WRITE |
1168 IBV_ACCESS_REMOTE_WRITE) : 0));
1169
1170 if (!block->pmr[chunk]) {
1171 perror("Failed to register chunk!");
1172 fprintf(stderrstderr, "Chunk details: block: %d chunk index %d"
1173 " start %" PRIu64"l" "u" " end %" PRIu64"l" "u" " host %" PRIu64"l" "u"
1174 " local %" PRIu64"l" "u" " registrations: %d\n",
1175 block->index, chunk, (uint64_t) chunk_start,
1176 (uint64_t) chunk_end, (uint64_t) host_addr,
1177 (uint64_t) block->local_host_addr,
1178 rdma->total_registrations);
1179 return -1;
1180 }
1181 rdma->total_registrations++;
1182 }
1183
1184 if (lkey) {
1185 *lkey = block->pmr[chunk]->lkey;
1186 }
1187 if (rkey) {
1188 *rkey = block->pmr[chunk]->rkey;
1189 }
1190 return 0;
1191}
1192
1193/*
1194 * Register (at connection time) the memory used for control
1195 * channel messages.
1196 */
1197static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
1198{
1199 rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
1200 rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
1201 IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
1202 if (rdma->wr_data[idx].control_mr) {
1203 rdma->total_registrations++;
1204 return 0;
1205 }
1206 fprintf(stderrstderr, "qemu_rdma_reg_control failed!\n");
1207 return -1;
1208}
1209
1210const char *print_wrid(int wrid)
1211{
1212 if (wrid >= RDMA_WRID_RECV_CONTROL) {
1213 return wrid_desc[RDMA_WRID_RECV_CONTROL];
1214 }
1215 return wrid_desc[wrid];
1216}
1217
1218/*
1219 * RDMA requires memory registration (mlock/pinning), but this is not good for
1220 * overcommitment.
1221 *
1222 * In preparation for the future where LRU information or workload-specific
1223 * writable working set memory access behavior is available to QEMU
1224 * it would be nice to have in place the ability to UN-register/UN-pin
1225 * particular memory regions from the RDMA hardware when it is determined that
1226 * those regions of memory will likely not be accessed again in the near future.
1227 *
1228 * While we do not yet have such information right now, the following
1229 * compile-time option allows us to perform a non-optimized version of this
1230 * behavior.
1231 *
1232 * By uncommenting this option, you will cause *all* RDMA transfers to be
1233 * unregistered immediately after the transfer completes on both sides of the
1234 * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
1235 *
1236 * This will have a terrible impact on migration performance, so until future
1237 * workload information or LRU information is available, do not attempt to use
1238 * this feature except for basic testing.
1239 */
1240//#define RDMA_UNREGISTRATION_EXAMPLE
1241
1242/*
1243 * Perform a non-optimized memory unregistration after every transfer
1244 * for demonstration purposes, only if pin-all is not requested.
1245 *
1246 * Potential optimizations:
1247 * 1. Start a new thread to run this function continuously
1248 - for bit clearing
1249 - and for receipt of unregister messages
1250 * 2. Use an LRU.
1251 * 3. Use workload hints.
1252 */
1253static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1254{
1255 while (rdma->unregistrations[rdma->unregister_current]) {
1256 int ret;
1257 uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1258 uint64_t chunk =
1259 (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1260 uint64_t index =
1261 (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1262 RDMALocalBlock *block =
1263 &(rdma->local_ram_blocks.block[index]);
1264 RDMARegister reg = { .current_index = index };
1265 RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1266 };
1267 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1268 .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1269 .repeat = 1,
1270 };
1271
1272 DDPRINTF("Processing unregister for chunk: %" PRIu64do { } while (0)
1273 " at position %d\n", chunk, rdma->unregister_current)do { } while (0);
1274
1275 rdma->unregistrations[rdma->unregister_current] = 0;
1276 rdma->unregister_current++;
1277
1278 if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1279 rdma->unregister_current = 0;
1280 }
1281
1282
1283 /*
1284 * Unregistration is speculative (because migration is single-threaded
1285 * and we cannot break the protocol's infiniband message ordering).
1286 * Thus, if the memory is currently being used for transmission,
1287 * then abort the attempt to unregister and try again
1288 * later the next time a completion is received for this memory.
1289 */
1290 clear_bit(chunk, block->unregister_bitmap);
1291
1292 if (test_bit(chunk, block->transit_bitmap)) {
1293 DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk)do { } while (0);
1294 continue;
1295 }
1296
1297 DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk)do { } while (0);
1298
1299 ret = ibv_dereg_mr(block->pmr[chunk]);
1300 block->pmr[chunk] = NULL;
1301 block->remote_keys[chunk] = 0;
1302
1303 if (ret != 0) {
1304 perror("unregistration chunk failed");
1305 return -ret;
1306 }
1307 rdma->total_registrations--;
1308
1309 reg.key.chunk = chunk;
1310 register_to_network(&reg);
1311 ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1312 &resp, NULL, NULL);
1313 if (ret < 0) {
1314 return ret;
1315 }
1316
1317 DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk)do { } while (0);
1318 }
1319
1320 return 0;
1321}
1322
1323static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1324 uint64_t chunk)
1325{
1326 uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1327
1328 result |= (index << RDMA_WRID_BLOCK_SHIFT);
1329 result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1330
1331 return result;
1332}
1333
1334/*
1335 * Set bit for unregistration in the next iteration.
1336 * We cannot transmit right here, but will unpin later.
1337 */
1338static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1339 uint64_t chunk, uint64_t wr_id)
1340{
1341 if (rdma->unregistrations[rdma->unregister_next] != 0) {
1342 fprintf(stderrstderr, "rdma migration: queue is full!\n");
1343 } else {
1344 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1345
1346 if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1347 DDPRINTF("Appending unregister chunk %" PRIu64do { } while (0)
1348 " at position %d\n", chunk, rdma->unregister_next)do { } while (0);
1349
1350 rdma->unregistrations[rdma->unregister_next++] =
1351 qemu_rdma_make_wrid(wr_id, index, chunk);
1352
1353 if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1354 rdma->unregister_next = 0;
1355 }
1356 } else {
1357 DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",do { } while (0)
1358 chunk)do { } while (0);
1359 }
1360 }
1361}
1362
1363/*
1364 * Consult the connection manager to see if a work request
1365 * (of any kind) has completed.
1366 * Return the work request ID that completed.
1367 */
1368static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
1369 uint32_t *byte_len)
1370{
1371 int ret;
1372 struct ibv_wc wc;
1373 uint64_t wr_id;
1374
1375 ret = ibv_poll_cq(rdma->cq, 1, &wc);
1376
1377 if (!ret) {
1378 *wr_id_out = RDMA_WRID_NONE;
1379 return 0;
1380 }
1381
1382 if (ret < 0) {
1383 fprintf(stderrstderr, "ibv_poll_cq return %d!\n", ret);
1384 return ret;
1385 }
1386
1387 wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1388
1389 if (wc.status != IBV_WC_SUCCESS) {
1390 fprintf(stderrstderr, "ibv_poll_cq wc.status=%d %s!\n",
1391 wc.status, ibv_wc_status_str(wc.status));
1392 fprintf(stderrstderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1393
1394 return -1;
1395 }
1396
1397 if (rdma->control_ready_expected &&
1398 (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1399 DDDPRINTF("completion %s #%" PRId64 " received (%" PRId64 ")"do { } while (0)
1400 " left %d\n", wrid_desc[RDMA_WRID_RECV_CONTROL],do { } while (0)
1401 wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent)do { } while (0);
1402 rdma->control_ready_expected = 0;
1403 }
1404
1405 if (wr_id == RDMA_WRID_RDMA_WRITE) {
1406 uint64_t chunk =
1407 (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1408 uint64_t index =
1409 (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1410 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1411
1412 DDDPRINTF("completions %s (%" PRId64 ") left %d, "do { } while (0)
1413 "block %" PRIu64 ", chunk: %" PRIu64 " %p %p\n",do { } while (0)
1414 print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk,do { } while (0)
1415 block->local_host_addr, (void *)block->remote_host_addr)do { } while (0);
1416
1417 clear_bit(chunk, block->transit_bitmap);
1418
1419 if (rdma->nb_sent > 0) {
1420 rdma->nb_sent--;
1421 }
1422
1423 if (!rdma->pin_all) {
1424 /*
1425 * FYI: If one wanted to signal a specific chunk to be unregistered
1426 * using LRU or workload-specific information, this is the function
1427 * you would call to do so. That chunk would then get asynchronously
1428 * unregistered later.
1429 */
1430#ifdef RDMA_UNREGISTRATION_EXAMPLE
1431 qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
1432#endif
1433 }
1434 } else {
1435 DDDPRINTF("other completion %s (%" PRId64 ") received left %d\n",do { } while (0)
1436 print_wrid(wr_id), wr_id, rdma->nb_sent)do { } while (0);
1437 }
1438
1439 *wr_id_out = wc.wr_id;
1440 if (byte_len) {
1441 *byte_len = wc.byte_len;
1442 }
1443
1444 return 0;
1445}
1446
1447/*
1448 * Block until the next work request has completed.
1449 *
1450 * First poll to see if a work request has already completed,
1451 * otherwise block.
1452 *
1453 * If we encounter completed work requests for IDs other than
1454 * the one we're interested in, then that's generally an error.
1455 *
1456 * The only exception is actual RDMA Write completions. These
1457 * completions only need to be recorded, but do not actually
1458 * need further processing.
1459 */
1460static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
1461 uint32_t *byte_len)
1462{
1463 int num_cq_events = 0, ret = 0;
1464 struct ibv_cq *cq;
1465 void *cq_ctx;
1466 uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1467
1468 if (ibv_req_notify_cq(rdma->cq, 0)) {
1469 return -1;
1470 }
1471 /* poll cq first */
1472 while (wr_id != wrid_requested) {
1473 ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1474 if (ret < 0) {
1475 return ret;
1476 }
1477
1478 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1479
1480 if (wr_id == RDMA_WRID_NONE) {
1481 break;
1482 }
1483 if (wr_id != wrid_requested) {
1484 DDDPRINTF("A Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",do { } while (0)
1485 print_wrid(wrid_requested),do { } while (0)
1486 wrid_requested, print_wrid(wr_id), wr_id)do { } while (0);
1487 }
1488 }
1489
1490 if (wr_id == wrid_requested) {
1491 return 0;
1492 }
1493
1494 while (1) {
1495 /*
1496 * Coroutine doesn't start until process_incoming_migration()
1497 * so don't yield unless we know we're running inside of a coroutine.
1498 */
1499 if (rdma->migration_started_on_destination) {
1500 yield_until_fd_readable(rdma->comp_channel->fd);
1501 }
1502
1503 if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) {
1504 perror("ibv_get_cq_event");
1505 goto err_block_for_wrid;
1506 }
1507
1508 num_cq_events++;
1509
1510 if (ibv_req_notify_cq(cq, 0)) {
1511 goto err_block_for_wrid;
1512 }
1513
1514 while (wr_id != wrid_requested) {
1515 ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1516 if (ret < 0) {
1517 goto err_block_for_wrid;
1518 }
1519
1520 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1521
1522 if (wr_id == RDMA_WRID_NONE) {
1523 break;
1524 }
1525 if (wr_id != wrid_requested) {
1526 DDDPRINTF("B Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",do { } while (0)
1527 print_wrid(wrid_requested), wrid_requested,do { } while (0)
1528 print_wrid(wr_id), wr_id)do { } while (0);
1529 }
1530 }
1531
1532 if (wr_id == wrid_requested) {
1533 goto success_block_for_wrid;
1534 }
1535 }
1536
1537success_block_for_wrid:
1538 if (num_cq_events) {
1539 ibv_ack_cq_events(cq, num_cq_events);
1540 }
1541 return 0;
1542
1543err_block_for_wrid:
1544 if (num_cq_events) {
1545 ibv_ack_cq_events(cq, num_cq_events);
1546 }
1547 return ret;
1548}
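
For readers less familiar with libibverbs, the function above is an instance of the usual "arm the CQ, poll, then sleep on the completion channel" idiom. The following compile-only sketch is not QEMU code: it assumes the caller already owns a CQ and its completion channel, and it omits the wr_id bookkeeping done above.

#include <infiniband/verbs.h>

static int wait_for_one_completion(struct ibv_cq *cq,
                                   struct ibv_comp_channel *channel,
                                   struct ibv_wc *wc)
{
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    /* Arm the CQ first so no completion can slip between poll and sleep. */
    if (ibv_req_notify_cq(cq, 0)) {
        return -1;
    }

    for (;;) {
        /* Cheap path: the completion may already be in the CQ. */
        int n = ibv_poll_cq(cq, 1, wc);
        if (n < 0) {
            return -1;
        }
        if (n == 1) {
            return 0;
        }

        /* Nothing yet: sleep on the completion channel, then re-arm. */
        if (ibv_get_cq_event(channel, &ev_cq, &ev_ctx)) {
            return -1;
        }
        ibv_ack_cq_events(ev_cq, 1);
        if (ibv_req_notify_cq(ev_cq, 0)) {
            return -1;
        }
    }
}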
1549
1550/*
1551 * Post a SEND message work request for the control channel
1552 * containing some data and block until the post completes.
1553 */
1554static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1555 RDMAControlHeader *head)
1556{
1557 int ret = 0;
1558 RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1559 struct ibv_send_wr *bad_wr;
1560 struct ibv_sge sge = {
1561 .addr = (uint64_t)(wr->control),
1562 .length = head->len + sizeof(RDMAControlHeader),
1563 .lkey = wr->control_mr->lkey,
1564 };
1565 struct ibv_send_wr send_wr = {
1566 .wr_id = RDMA_WRID_SEND_CONTROL,
1567 .opcode = IBV_WR_SEND,
1568 .send_flags = IBV_SEND_SIGNALED,
1569 .sg_list = &sge,
1570 .num_sge = 1,
1571 };
1572
1573 DDDPRINTF("CONTROL: sending %s..\n", control_desc[head->type])do { } while (0);
1574
1575 /*
1576 * We don't actually need to do a memcpy() in here if we used
1577 * the "sge" properly, but since we're only sending control messages
1578 * (not RAM in a performance-critical path), then it's OK for now.
1579 *
1580 * The copy makes the RDMAControlHeader simpler to manipulate
1581 * for the time being.
1582 */
1583 assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head))((head->len <= (512 * 1024) - sizeof(*head)) ? (void) (
0) : __assert_fail ("head->len <= (512 * 1024) - sizeof(*head)"
, "/home/stefan/src/qemu/qemu.org/qemu/migration-rdma.c", 1583
, __PRETTY_FUNCTION__))
;
1584 memcpy(wr->control, head, sizeof(RDMAControlHeader));
1585 control_to_network((void *) wr->control);
1586
1587 if (buf) {
1588 memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1589 }
1590
1591
1592 if (ibv_post_send(rdma->qp, &send_wr, &bad_wr)) {
1593 return -1;
1594 }
1595
1596 if (ret < 0) {
1597 fprintf(stderrstderr, "Failed to use post IB SEND for control!\n");
1598 return ret;
1599 }
1600
1601 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL((void*)0));
1602 if (ret < 0) {
1603 fprintf(stderrstderr, "rdma migration: send polling control error!\n");
1604 }
1605
1606 return ret;
1607}
1608
1609/*
1610 * Post a RECV work request in anticipation of some future receipt
1611 * of data on the control channel.
1612 */
1613static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1614{
1615 struct ibv_recv_wr *bad_wr;
1616 struct ibv_sge sge = {
1617 .addr = (uint64_t)(rdma->wr_data[idx].control),
1618 .length = RDMA_CONTROL_MAX_BUFFER(512 * 1024),
1619 .lkey = rdma->wr_data[idx].control_mr->lkey,
1620 };
1621
1622 struct ibv_recv_wr recv_wr = {
1623 .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1624 .sg_list = &sge,
1625 .num_sge = 1,
1626 };
1627
1628
1629 if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1630 return -1;
1631 }
1632
1633 return 0;
1634}
1635
1636/*
1637 * Block and wait for a RECV control channel message to arrive.
1638 */
1639static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1640 RDMAControlHeader *head, int expecting, int idx)
1641{
1642 uint32_t byte_len;
1643 int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1644 &byte_len);
1645
1646 if (ret < 0) {
1647 fprintf(stderrstderr, "rdma migration: recv polling control error!\n");
1648 return ret;
1649 }
1650
1651 network_to_control((void *) rdma->wr_data[idx].control);
1652 memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1653
1654 DDDPRINTF("CONTROL: %s receiving...\n", control_desc[expecting])do { } while (0);
1655
1656 if (expecting == RDMA_CONTROL_NONE) {
1657 DDDPRINTF("Surprise: got %s (%d)\n",do { } while (0)
1658 control_desc[head->type], head->type)do { } while (0);
1659 } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1660 fprintf(stderrstderr, "Was expecting a %s (%d) control message"
1661 ", but got: %s (%d), length: %d\n",
1662 control_desc[expecting], expecting,
1663 control_desc[head->type], head->type, head->len);
1664 return -EIO5;
1665 }
1666 if (head->len > RDMA_CONTROL_MAX_BUFFER(512 * 1024) - sizeof(*head)) {
1667 fprintf(stderrstderr, "too long length: %d\n", head->len);
1668 return -EINVAL22;
1669 }
1670 if (sizeof(*head) + head->len != byte_len) {
1671 fprintf(stderrstderr, "Malformed length: %d byte_len %d\n",
1672 head->len, byte_len);
1673 return -EINVAL22;
1674 }
1675
1676 return 0;
1677}
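
The two length checks at the end of this function are what keep a malformed control message from overrunning the preregistered receive buffer. Below is a self-contained sketch of the same checks, using a hypothetical stand-in for RDMAControlHeader (the real struct is defined earlier in this file and may differ).

#include <stdint.h>
#include <stdio.h>

#define CTRL_MAX_BUFFER (512 * 1024)   /* mirrors RDMA_CONTROL_MAX_BUFFER */

typedef struct {                       /* hypothetical stand-in header */
    uint32_t len;
    uint32_t type;
    uint32_t repeat;
} CtrlHeader;

/* Return 0 if a header advertising 'len' payload bytes, delivered in a
 * completion of 'byte_len' bytes, is plausible; mirrors the checks above. */
static int ctrl_header_sane(const CtrlHeader *h, uint32_t byte_len)
{
    if (h->len > CTRL_MAX_BUFFER - sizeof(*h)) {
        return -1;                     /* would overflow the recv buffer */
    }
    if (sizeof(*h) + h->len != byte_len) {
        return -1;                     /* truncated or padded message */
    }
    return 0;
}

int main(void)
{
    CtrlHeader h = { .len = 16, .type = 1, .repeat = 1 };
    printf("ok=%d\n", ctrl_header_sane(&h, sizeof(h) + 16) == 0);
    return 0;
}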
1678
1679/*
1680 * When a RECV work request has completed, the work request's
1681 * buffer is pointed at the header.
1682 *
1683 * This advances the pointer past the header to the data
1684 * portion of the control message in the work request's buffer,
1685 * which was populated after the work request finished.
1686 */
1687static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1688 RDMAControlHeader *head)
1689{
1690 rdma->wr_data[idx].control_len = head->len;
1691 rdma->wr_data[idx].control_curr =
1692 rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1693}
1694
1695/*
1696 * This is an 'atomic' high-level operation to deliver a single, unified
1697 * control-channel message.
1698 *
1699 * Additionally, if the user is expecting some kind of reply to this message,
1700 * they can request a 'resp' response message be filled in by posting an
1701 * additional work request on behalf of the user and waiting for an additional
1702 * completion.
1703 *
1704 * The extra (optional) response is used during registration to spare us
1705 * from having to perform an *additional* exchange of messages just to
1706 * provide a response, by instead piggy-backing on the acknowledgement.
1707 */
1708static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1709 uint8_t *data, RDMAControlHeader *resp,
1710 int *resp_idx,
1711 int (*callback)(RDMAContext *rdma))
1712{
1713 int ret = 0;
1714
1715 /*
1716 * Wait until the dest is ready before attempting to deliver the message
1717 * by waiting for a READY message.
1718 */
1719 if (rdma->control_ready_expected) {
1720 RDMAControlHeader resp;
1721 ret = qemu_rdma_exchange_get_response(rdma,
1722 &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1723 if (ret < 0) {
1724 return ret;
1725 }
1726 }
1727
1728 /*
1729 * If the user is expecting a response, post a WR in anticipation of it.
1730 */
1731 if (resp) {
1732 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1733 if (ret) {
1734 fprintf(stderrstderr, "rdma migration: error posting"
1735 " extra control recv for anticipated result!");
1736 return ret;
1737 }
1738 }
1739
1740 /*
1741 * Post a WR to replace the one we just consumed for the READY message.
1742 */
1743 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1744 if (ret) {
1745 fprintf(stderrstderr, "rdma migration: error posting first control recv!");
1746 return ret;
1747 }
1748
1749 /*
1750 * Deliver the control message that was requested.
1751 */
1752 ret = qemu_rdma_post_send_control(rdma, data, head);
1753
1754 if (ret < 0) {
1755 fprintf(stderrstderr, "Failed to send control buffer!\n");
1756 return ret;
1757 }
1758
1759 /*
1760 * If we're expecting a response, block and wait for it.
1761 */
1762 if (resp) {
1763 if (callback) {
1764 DDPRINTF("Issuing callback before receiving response...\n")do { } while (0);
1765 ret = callback(rdma);
1766 if (ret < 0) {
1767 return ret;
1768 }
1769 }
1770
1771 DDPRINTF("Waiting for response %s\n", control_desc[resp->type])do { } while (0);
1772 ret = qemu_rdma_exchange_get_response(rdma, resp,
1773 resp->type, RDMA_WRID_DATA);
1774
1775 if (ret < 0) {
1776 return ret;
1777 }
1778
1779 qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1780 if (resp_idx) {
1781 *resp_idx = RDMA_WRID_DATA;
1782 }
1783 DDPRINTF("Response %s received.\n", control_desc[resp->type])do { } while (0);
1784 }
1785
1786 rdma->control_ready_expected = 1;
1787
1788 return 0;
1789}
1790
1791/*
1792 * This is an 'atomic' high-level operation to receive a single, unified
1793 * control-channel message.
1794 */
1795static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1796 int expecting)
1797{
1798 RDMAControlHeader ready = {
1799 .len = 0,
1800 .type = RDMA_CONTROL_READY,
1801 .repeat = 1,
1802 };
1803 int ret;
1804
1805 /*
1806 * Inform the source that we're ready to receive a message.
1807 */
1808 ret = qemu_rdma_post_send_control(rdma, NULL((void*)0), &ready);
1809
1810 if (ret < 0) {
1811 fprintf(stderrstderr, "Failed to send control buffer!\n");
1812 return ret;
1813 }
1814
1815 /*
1816 * Block and wait for the message.
1817 */
1818 ret = qemu_rdma_exchange_get_response(rdma, head,
1819 expecting, RDMA_WRID_READY);
1820
1821 if (ret < 0) {
1822 return ret;
1823 }
1824
1825 qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1826
1827 /*
1828 * Post a new RECV work request to replace the one we just consumed.
1829 */
1830 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1831 if (ret) {
1832 fprintf(stderrstderr, "rdma migration: error posting second control recv!");
1833 return ret;
1834 }
1835
1836 return 0;
1837}
1838
1839/*
1840 * Write an actual chunk of memory using RDMA.
1841 *
1842 * If we're using dynamic registration on the dest-side, we have to
1843 * send a registration command first.
1844 */
1845static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
1846 int current_index, uint64_t current_addr,
1847 uint64_t length)
1848{
1849 struct ibv_sge sge;
1850 struct ibv_send_wr send_wr = { 0 };
1851 struct ibv_send_wr *bad_wr;
1852 int reg_result_idx, ret, count = 0;
1853 uint64_t chunk, chunks;
1854 uint8_t *chunk_start, *chunk_end;
1855 RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
1856 RDMARegister reg;
1857 RDMARegisterResult *reg_result;
1858 RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
1859 RDMAControlHeader head = { .len = sizeof(RDMARegister),
1860 .type = RDMA_CONTROL_REGISTER_REQUEST,
1861 .repeat = 1,
1862 };
1863
1864retry:
1865 sge.addr = (uint64_t)(block->local_host_addr +
1866 (current_addr - block->offset));
1867 sge.length = length;
1868
1869 chunk = ram_chunk_index(block->local_host_addr, (uint8_t *) sge.addr);
1870 chunk_start = ram_chunk_start(block, chunk);
1871
1872 if (block->is_ram_block) {
1873 chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT20);
1874
1875 if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT20)) == 0)) {
1876 chunks--;
1877 }
1878 } else {
1879 chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT20);
1880
1881 if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT20)) == 0)) {
1882 chunks--;
1883 }
1884 }
1885
1886 DDPRINTF("Writing %" PRIu64 " chunks, (%" PRIu64 " MB)\n",do { } while (0)
1887 chunks + 1, (chunks + 1) * (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024)do { } while (0);
1888
1889 chunk_end = ram_chunk_end(block, chunk + chunks);
1890
1891 if (!rdma->pin_all) {
1892#ifdef RDMA_UNREGISTRATION_EXAMPLE
1893 qemu_rdma_unregister_waiting(rdma);
1894#endif
1895 }
1896
1897 while (test_bit(chunk, block->transit_bitmap)) {
1898 (void)count;
1899 DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64do { } while (0)
1900 " current %" PRIu64 " len %" PRIu64 " %d %d\n",do { } while (0)
1901 count++, current_index, chunk,do { } while (0)
1902 sge.addr, length, rdma->nb_sent, block->nb_chunks)do { } while (0);
1903
1904 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL((void*)0));
1905
1906 if (ret < 0) {
1907 fprintf(stderrstderr, "Failed to Wait for previous write to complete "
1908 "block %d chunk %" PRIu64"l" "u"
1909 " current %" PRIu64"l" "u" " len %" PRIu64"l" "u" " %d\n",
1910 current_index, chunk, sge.addr, length, rdma->nb_sent);
1911 return ret;
1912 }
1913 }
1914
1915 if (!rdma->pin_all || !block->is_ram_block) {
1916 if (!block->remote_keys[chunk]) {
1917 /*
1918 * This chunk has not yet been registered, so first check to see
1919 * if the entire chunk is zero. If so, tell the other side to
1920 * memset() + madvise() the entire chunk without RDMA.
1921 */
1922
1923 if (can_use_buffer_find_nonzero_offset((void *)sge.addr, length)
1924 && buffer_find_nonzero_offset((void *)sge.addr,
1925 length) == length) {
1926 RDMACompress comp = {
1927 .offset = current_addr,
1928 .value = 0,
1929 .block_idx = current_index,
1930 .length = length,
1931 };
1932
1933 head.len = sizeof(comp);
1934 head.type = RDMA_CONTROL_COMPRESS;
1935
1936 DDPRINTF("Entire chunk is zero, sending compress: %"do { } while (0)
1937 PRIu64 " for %d "do { } while (0)
1938 "bytes, index: %d, offset: %" PRId64 "...\n",do { } while (0)
1939 chunk, sge.length, current_index, current_addr)do { } while (0);
1940
1941 compress_to_network(&comp);
1942 ret = qemu_rdma_exchange_send(rdma, &head,
1943 (uint8_t *) &comp, NULL((void*)0), NULL((void*)0), NULL((void*)0));
1944
1945 if (ret < 0) {
1946 return -EIO5;
1947 }
1948
1949 acct_update_position(f, sge.length, true1);
1950
1951 return 1;
1952 }
1953
1954 /*
1955 * Otherwise, tell other side to register.
1956 */
1957 reg.current_index = current_index;
1958 if (block->is_ram_block) {
1959 reg.key.current_addr = current_addr;
1960 } else {
1961 reg.key.chunk = chunk;
1962 }
1963 reg.chunks = chunks;
1964
1965 DDPRINTF("Sending registration request chunk %" PRIu64 " for %d "do { } while (0)
1966 "bytes, index: %d, offset: %" PRId64 "...\n",do { } while (0)
1967 chunk, sge.length, current_index, current_addr)do { } while (0);
1968
1969 register_to_network(&reg);
1970 ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1971 &resp, &reg_result_idx, NULL((void*)0));
1972 if (ret < 0) {
1973 return ret;
1974 }
1975
1976 /* try to overlap this single registration with the one we sent. */
1977 if (qemu_rdma_register_and_get_keys(rdma, block,
1978 (uint8_t *) sge.addr,
1979 &sge.lkey, NULL((void*)0), chunk,
1980 chunk_start, chunk_end)) {
1981 fprintf(stderrstderr, "cannot get lkey!\n");
1982 return -EINVAL22;
1983 }
1984
1985 reg_result = (RDMARegisterResult *)
1986 rdma->wr_data[reg_result_idx].control_curr;
1987
1988 network_to_result(reg_result);
1989
1990 DDPRINTF("Received registration result:"do { } while (0)
1991 " my key: %x their key %x, chunk %" PRIu64 "\n",do { } while (0)
1992 block->remote_keys[chunk], reg_result->rkey, chunk)do { } while (0);
1993
1994 block->remote_keys[chunk] = reg_result->rkey;
1995 block->remote_host_addr = reg_result->host_addr;
1996 } else {
1997 /* already registered before */
1998 if (qemu_rdma_register_and_get_keys(rdma, block,
1999 (uint8_t *)sge.addr,
2000 &sge.lkey, NULL((void*)0), chunk,
2001 chunk_start, chunk_end)) {
2002 fprintf(stderrstderr, "cannot get lkey!\n");
2003 return -EINVAL22;
2004 }
2005 }
2006
2007 send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2008 } else {
2009 send_wr.wr.rdma.rkey = block->remote_rkey;
2010
2011 if (qemu_rdma_register_and_get_keys(rdma, block, (uint8_t *)sge.addr,
2012 &sge.lkey, NULL((void*)0), chunk,
2013 chunk_start, chunk_end)) {
2014 fprintf(stderrstderr, "cannot get lkey!\n");
2015 return -EINVAL22;
2016 }
2017 }
2018
2019 /*
2020 * Encode the ram block index and chunk within this wrid.
2021 * We will use this information at the time of completion
2022 * to figure out which bitmap to check against and then which
2023 * chunk in the bitmap to look for.
2024 */
2025 send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2026 current_index, chunk);
2027
2028 send_wr.opcode = IBV_WR_RDMA_WRITE;
2029 send_wr.send_flags = IBV_SEND_SIGNALED;
2030 send_wr.sg_list = &sge;
2031 send_wr.num_sge = 1;
2032 send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2033 (current_addr - block->offset);
2034
2035 DDDPRINTF("Posting chunk: %" PRIu64 ", addr: %lx"do { } while (0)
2036 " remote: %lx, bytes %" PRIu32 "\n",do { } while (0)
2037 chunk, sge.addr, send_wr.wr.rdma.remote_addr,do { } while (0)
2038 sge.length)do { } while (0);
2039
2040 /*
2041 * ibv_post_send() does not return negative error numbers;
2042 * per the specification they are positive - no idea why.
2043 */
2044 ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2045
2046 if (ret == ENOMEM12) {
2047 DDPRINTF("send queue is full. wait a little....\n")do { } while (0);
2048 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL((void*)0));
2049 if (ret < 0) {
2050 fprintf(stderrstderr, "rdma migration: failed to make "
2051 "room in full send queue! %d\n", ret);
2052 return ret;
2053 }
2054
2055 goto retry;
2056
2057 } else if (ret > 0) {
2058 perror("rdma migration: post rdma write failed");
2059 return -ret;
2060 }
2061
2062 set_bit(chunk, block->transit_bitmap);
2063 acct_update_position(f, sge.length, false0);
2064 rdma->total_writes++;
2065
2066 return 0;
2067}
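
The comment in qemu_rdma_write_one above describes packing the work-request type, RAM block index and chunk number into the 64-bit wr_id so the completion handler can find the right transit bitmap. Here is a stand-alone sketch of one plausible encoding; the shift values and the example type value are illustrative assumptions, the real constants are defined near the top of migration-rdma.c.

#include <stdint.h>
#include <stdio.h>

/* Illustrative bit layout only; not the definitive QEMU constants. */
#define WRID_TYPE_MASK   ((1UL << 16) - 1)
#define WRID_BLOCK_SHIFT 16
#define WRID_CHUNK_SHIFT 30

static uint64_t make_wrid(uint64_t type, uint64_t block, uint64_t chunk)
{
    return (chunk << WRID_CHUNK_SHIFT) | (block << WRID_BLOCK_SHIFT) | type;
}

int main(void)
{
    uint64_t wrid = make_wrid(3 /* e.g. an RDMA write type */, 2, 517);

    /* At completion time the same fields are decoded again, much like
     * the poll path does before clearing the transit bitmap. */
    printf("type=%llu block=%llu chunk=%llu\n",
           (unsigned long long)(wrid & WRID_TYPE_MASK),
           (unsigned long long)((wrid >> WRID_BLOCK_SHIFT) &
                ((1UL << (WRID_CHUNK_SHIFT - WRID_BLOCK_SHIFT)) - 1)),
           (unsigned long long)(wrid >> WRID_CHUNK_SHIFT));
    return 0;
}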
2068
2069/*
2070 * Push out any unwritten RDMA operations.
2071 *
2072 * We support sending out multiple chunks at the same time.
2073 * Not all of them need to get signaled in the completion queue.
2074 */
2075static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2076{
2077 int ret;
2078
2079 if (!rdma->current_length) {
2080 return 0;
2081 }
2082
2083 ret = qemu_rdma_write_one(f, rdma,
2084 rdma->current_index, rdma->current_addr, rdma->current_length);
2085
2086 if (ret < 0) {
2087 return ret;
2088 }
2089
2090 if (ret == 0) {
2091 rdma->nb_sent++;
2092 DDDPRINTF("sent total: %d\n", rdma->nb_sent)do { } while (0);
2093 }
2094
2095 rdma->current_length = 0;
2096 rdma->current_addr = 0;
2097
2098 return 0;
2099}
2100
2101static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2102 uint64_t offset, uint64_t len)
2103{
2104 RDMALocalBlock *block;
2105 uint8_t *host_addr;
2106 uint8_t *chunk_end;
2107
2108 if (rdma->current_index < 0) {
2109 return 0;
2110 }
2111
2112 if (rdma->current_chunk < 0) {
2113 return 0;
2114 }
2115
2116 block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2117 host_addr = block->local_host_addr + (offset - block->offset);
2118 chunk_end = ram_chunk_end(block, rdma->current_chunk);
2119
2120 if (rdma->current_length == 0) {
2121 return 0;
2122 }
2123
2124 /*
2125 * Only merge into chunk sequentially.
2126 */
2127 if (offset != (rdma->current_addr + rdma->current_length)) {
2128 return 0;
2129 }
2130
2131 if (offset < block->offset) {
2132 return 0;
2133 }
2134
2135 if ((offset + len) > (block->offset + block->length)) {
2136 return 0;
2137 }
2138
2139 if ((host_addr + len) > chunk_end) {
2140 return 0;
2141 }
2142
2143 return 1;
2144}
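
The checks above boil down to three conditions: the new data must continue exactly where the queued data ends, must stay inside the same RAM block, and must not cross the current 1 MB chunk. A simplified, self-contained model of the same test follows; the names are hypothetical and, unlike the real code, it works on plain offsets rather than host addresses.

#include <stdint.h>
#include <stdio.h>

#define CHUNK_SIZE (1UL << 20)         /* mirrors RDMA_REG_CHUNK_SHIFT */

typedef struct {                       /* hypothetical stand-in state */
    uint64_t block_start;              /* start offset of the RAM block */
    uint64_t block_len;
    uint64_t cur_addr;                 /* start of queued, unflushed data */
    uint64_t cur_len;                  /* bytes queued so far */
    uint64_t chunk_end;                /* end offset of the current chunk */
} MergeState;

static int mergeable(const MergeState *s, uint64_t offset, uint64_t len)
{
    if (s->cur_len == 0) {
        return 0;                              /* nothing queued yet */
    }
    if (offset != s->cur_addr + s->cur_len) {
        return 0;                              /* not sequential */
    }
    if (offset < s->block_start ||
        offset + len > s->block_start + s->block_len) {
        return 0;                              /* leaves the RAM block */
    }
    if (offset + len > s->chunk_end) {
        return 0;                              /* crosses the chunk */
    }
    return 1;
}

int main(void)
{
    MergeState s = { 0, 16 * CHUNK_SIZE, 4096, 4096, CHUNK_SIZE };
    printf("%d %d\n", mergeable(&s, 8192, 4096),   /* sequential: merge  */
                      mergeable(&s, 16384, 4096)); /* gap: flush instead */
    return 0;
}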
2145
2146/*
2147 * We're not actually writing here, but doing three things:
2148 *
2149 * 1. Identify the chunk the buffer belongs to.
2150 * 2. If the chunk is full or the buffer doesn't belong to the current
2151 * chunk, then start a new chunk and flush() the old chunk.
2152 * 3. To keep the hardware busy, we also group chunks into batches
2153 * and only require that a batch gets acknowledged in the completion
2154 * queue instead of each individual chunk.
2155 */
2156static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
2157 uint64_t block_offset, uint64_t offset,
2158 uint64_t len)
2159{
2160 uint64_t current_addr = block_offset + offset;
2161 uint64_t index = rdma->current_index;
2162 uint64_t chunk = rdma->current_chunk;
2163 int ret;
2164
2165 /* If we cannot merge it, we flush the current buffer first. */
2166 if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
2167 ret = qemu_rdma_write_flush(f, rdma);
2168 if (ret) {
2169 return ret;
2170 }
2171 rdma->current_length = 0;
2172 rdma->current_addr = current_addr;
2173
2174 ret = qemu_rdma_search_ram_block(rdma, block_offset,
2175 offset, len, &index, &chunk);
2176 if (ret) {
2177 fprintf(stderrstderr, "ram block search failed\n");
2178 return ret;
2179 }
2180 rdma->current_index = index;
2181 rdma->current_chunk = chunk;
2182 }
2183
2184 /* merge it */
2185 rdma->current_length += len;
2186
2187 /* flush it if buffer is too large */
2188 if (rdma->current_length >= RDMA_MERGE_MAX(2 * 1024 * 1024)) {
2189 return qemu_rdma_write_flush(f, rdma);
2190 }
2191
2192 return 0;
2193}
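
To make the batching concrete, here is a toy model (not QEMU code) of how sequential 4 KB pages accumulate into one pending write that is flushed once it reaches RDMA_MERGE_MAX, so the hardware sees a few large writes instead of one per page.

#include <stdint.h>
#include <stdio.h>

#define MERGE_MAX (2 * 1024 * 1024)    /* mirrors RDMA_MERGE_MAX */
#define PAGE_SIZE 4096

int main(void)
{
    uint64_t pending = 0, flushes = 0;

    for (int page = 0; page < 1024; page++) {   /* 4 MB of sequential pages */
        pending += PAGE_SIZE;                   /* "merge" into the buffer  */
        if (pending >= MERGE_MAX) {
            flushes++;                          /* one RDMA write posted    */
            pending = 0;
        }
    }
    printf("posted %llu large writes instead of 1024 page-sized ones\n",
           (unsigned long long)flushes);
    return 0;
}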
2194
2195static void qemu_rdma_cleanup(RDMAContext *rdma)
2196{
2197 struct rdma_cm_event *cm_event;
2198 int ret, idx;
2199
2200 if (rdma->cm_id && rdma->connected) {
2201 if (rdma->error_state) {
2202 RDMAControlHeader head = { .len = 0,
2203 .type = RDMA_CONTROL_ERROR,
2204 .repeat = 1,
2205 };
2206 fprintf(stderrstderr, "Early error. Sending error.\n");
2207 qemu_rdma_post_send_control(rdma, NULL((void*)0), &head);
2208 }
2209
2210 ret = rdma_disconnect(rdma->cm_id);
2211 if (!ret) {
2212 DDPRINTF("waiting for disconnect\n")do { } while (0);
2213 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2214 if (!ret) {
2215 rdma_ack_cm_event(cm_event);
2216 }
2217 }
2218 DDPRINTF("Disconnected.\n")do { } while (0);
2219 rdma->connected = false0;
2220 }
2221
2222 g_free(rdma->block);
2223 rdma->block = NULL((void*)0);
2224
2225 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
12
Loop condition is true. Entering loop body
14
Loop condition is true. Entering loop body
16
Loop condition is true. Entering loop body
18
Loop condition is false. Execution continues on line 2233
2226 if (rdma->wr_data[idx].control_mr) {
13
Taking false branch
15
Taking false branch
17
Taking false branch
2227 rdma->total_registrations--;
2228 ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2229 }
2230 rdma->wr_data[idx].control_mr = NULL((void*)0);
2231 }
2232
2233 if (rdma->local_ram_blocks.block) {
19
Taking true branch
2234 while (rdma->local_ram_blocks.nb_blocks) {
20
Loop condition is true. Entering loop body
2235 __qemu_rdma_delete_block(rdma,
21
Calling '__qemu_rdma_delete_block'
2236 rdma->local_ram_blocks.block->offset);
2237 }
2238 }
2239
2240 if (rdma->qp) {
2241 rdma_destroy_qp(rdma->cm_id);
2242 rdma->qp = NULL((void*)0);
2243 }
2244 if (rdma->cq) {
2245 ibv_destroy_cq(rdma->cq);
2246 rdma->cq = NULL((void*)0);
2247 }
2248 if (rdma->comp_channel) {
2249 ibv_destroy_comp_channel(rdma->comp_channel);
2250 rdma->comp_channel = NULL((void*)0);
2251 }
2252 if (rdma->pd) {
2253 ibv_dealloc_pd(rdma->pd);
2254 rdma->pd = NULL((void*)0);
2255 }
2256 if (rdma->listen_id) {
2257 rdma_destroy_id(rdma->listen_id);
2258 rdma->listen_id = NULL((void*)0);
2259 }
2260 if (rdma->cm_id) {
2261 rdma_destroy_id(rdma->cm_id);
2262 rdma->cm_id = NULL((void*)0);
2263 }
2264 if (rdma->channel) {
2265 rdma_destroy_event_channel(rdma->channel);
2266 rdma->channel = NULL((void*)0);
2267 }
2268 g_free(rdma->host);
2269 rdma->host = NULL((void*)0);
2270}
2271
2272
2273static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool_Bool pin_all)
2274{
2275 int ret, idx;
2276 Error *local_err = NULL((void*)0), **temp = &local_err;
2277
2278 /*
2279 * Will be validated against destination's actual capabilities
2280 * after the connect() completes.
2281 */
2282 rdma->pin_all = pin_all;
2283
2284 ret = qemu_rdma_resolve_host(rdma, temp);
2285 if (ret) {
3
Assuming 'ret' is 0
4
Taking false branch
2286 goto err_rdma_source_init;
2287 }
2288
2289 ret = qemu_rdma_alloc_pd_cq(rdma);
2290 if (ret) {
5
Taking false branch
2291 ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"do { fprintf(stderr, "RDMA ERROR: " "rdma migration: error allocating pd and cq! Your mlock()"
" limits may be too low. Please check $ ulimit -a # and " "search for 'ulimit -l' in the output"
"\n"); if (temp && (*(temp) == ((void*)0))) { error_set
(temp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "rdma migration: error allocating pd and cq! Your mlock()"
" limits may be too low. Please check $ ulimit -a # and " "search for 'ulimit -l' in the output"
); } } while (0)
2292 " limits may be too low. Please check $ ulimit -a # and "do { fprintf(stderr, "RDMA ERROR: " "rdma migration: error allocating pd and cq! Your mlock()"
" limits may be too low. Please check $ ulimit -a # and " "search for 'ulimit -l' in the output"
"\n"); if (temp && (*(temp) == ((void*)0))) { error_set
(temp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "rdma migration: error allocating pd and cq! Your mlock()"
" limits may be too low. Please check $ ulimit -a # and " "search for 'ulimit -l' in the output"
); } } while (0)
2293 "search for 'ulimit -l' in the output")do { fprintf(stderr, "RDMA ERROR: " "rdma migration: error allocating pd and cq! Your mlock()"
" limits may be too low. Please check $ ulimit -a # and " "search for 'ulimit -l' in the output"
"\n"); if (temp && (*(temp) == ((void*)0))) { error_set
(temp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "rdma migration: error allocating pd and cq! Your mlock()"
" limits may be too low. Please check $ ulimit -a # and " "search for 'ulimit -l' in the output"
); } } while (0)
;
2294 goto err_rdma_source_init;
2295 }
2296
2297 ret = qemu_rdma_alloc_qp(rdma);
2298 if (ret) {
6
Taking false branch
2299 ERROR(temp, "rdma migration: error allocating qp!")do { fprintf(stderr, "RDMA ERROR: " "rdma migration: error allocating qp!"
"\n"); if (temp && (*(temp) == ((void*)0))) { error_set
(temp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "rdma migration: error allocating qp!"
); } } while (0)
;
2300 goto err_rdma_source_init;
2301 }
2302
2303 ret = qemu_rdma_init_ram_blocks(rdma);
2304 if (ret) {
7
Taking false branch
2305 ERROR(temp, "rdma migration: error initializing ram blocks!")do { fprintf(stderr, "RDMA ERROR: " "rdma migration: error initializing ram blocks!"
"\n"); if (temp && (*(temp) == ((void*)0))) { error_set
(temp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "rdma migration: error initializing ram blocks!"
); } } while (0)
;
2306 goto err_rdma_source_init;
2307 }
2308
2309 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
8
Loop condition is true. Entering loop body
2310 ret = qemu_rdma_reg_control(rdma, idx);
2311 if (ret) {
9
Taking true branch
2312 ERROR(temp, "rdma migration: error registering %d control!",do { fprintf(stderr, "RDMA ERROR: " "rdma migration: error registering %d control!"
"\n", idx); if (temp && (*(temp) == ((void*)0))) { error_set
(temp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "rdma migration: error registering %d control!"
, idx); } } while (0)
2313 idx)do { fprintf(stderr, "RDMA ERROR: " "rdma migration: error registering %d control!"
"\n", idx); if (temp && (*(temp) == ((void*)0))) { error_set
(temp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "rdma migration: error registering %d control!"
, idx); } } while (0)
;
2314 goto err_rdma_source_init;
10
Control jumps to line 2321
2315 }
2316 }
2317
2318 return 0;
2319
2320err_rdma_source_init:
2321 error_propagate(errp, local_err);
2322 qemu_rdma_cleanup(rdma);
11
Calling 'qemu_rdma_cleanup'
2323 return -1;
2324}
2325
2326static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
2327{
2328 RDMACapabilities cap = {
2329 .version = RDMA_CONTROL_VERSION_CURRENT1,
2330 .flags = 0,
2331 };
2332 struct rdma_conn_param conn_param = { .initiator_depth = 2,
2333 .retry_count = 5,
2334 .private_data = &cap,
2335 .private_data_len = sizeof(cap),
2336 };
2337 struct rdma_cm_event *cm_event;
2338 int ret;
2339
2340 /*
2341 * Only negotiate the capability with destination if the user
2342 * on the source first requested the capability.
2343 */
2344 if (rdma->pin_all) {
2345 DPRINTF("Server pin-all memory requested.\n")do { } while (0);
2346 cap.flags |= RDMA_CAPABILITY_PIN_ALL0x01;
2347 }
2348
2349 caps_to_network(&cap);
2350
2351 ret = rdma_connect(rdma->cm_id, &conn_param);
2352 if (ret) {
2353 perror("rdma_connect");
2354 ERROR(errp, "connecting to destination!")do { fprintf(stderr, "RDMA ERROR: " "connecting to destination!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "connecting to destination!"
); } } while (0)
;
2355 rdma_destroy_id(rdma->cm_id);
2356 rdma->cm_id = NULL((void*)0);
2357 goto err_rdma_source_connect;
2358 }
2359
2360 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2361 if (ret) {
2362 perror("rdma_get_cm_event after rdma_connect");
2363 ERROR(errp, "connecting to destination!")do { fprintf(stderr, "RDMA ERROR: " "connecting to destination!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "connecting to destination!"
); } } while (0)
;
2364 rdma_ack_cm_event(cm_event);
2365 rdma_destroy_id(rdma->cm_id);
2366 rdma->cm_id = NULL((void*)0);
2367 goto err_rdma_source_connect;
2368 }
2369
2370 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2371 perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2372 ERROR(errp, "connecting to destination!")do { fprintf(stderr, "RDMA ERROR: " "connecting to destination!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "connecting to destination!"
); } } while (0)
;
2373 rdma_ack_cm_event(cm_event);
2374 rdma_destroy_id(rdma->cm_id);
2375 rdma->cm_id = NULL((void*)0);
2376 goto err_rdma_source_connect;
2377 }
2378 rdma->connected = true1;
2379
2380 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2381 network_to_caps(&cap);
2382
2383 /*
2384 * Verify that the *requested* capabilities are supported by the destination
2385 * and disable them otherwise.
2386 */
2387 if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL0x01)) {
2388 ERROR(errp, "Server cannot support pinning all memory. "do { fprintf(stderr, "RDMA ERROR: " "Server cannot support pinning all memory. "
"Will register memory dynamically." "\n"); if (errp &&
(*(errp) == ((void*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR
, "RDMA ERROR: " "Server cannot support pinning all memory. "
"Will register memory dynamically."); } } while (0)
2389 "Will register memory dynamically.")do { fprintf(stderr, "RDMA ERROR: " "Server cannot support pinning all memory. "
"Will register memory dynamically." "\n"); if (errp &&
(*(errp) == ((void*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR
, "RDMA ERROR: " "Server cannot support pinning all memory. "
"Will register memory dynamically."); } } while (0)
;
2390 rdma->pin_all = false0;
2391 }
2392
2393 DPRINTF("Pin all memory: %s\n", rdma->pin_all ? "enabled" : "disabled")do { } while (0);
2394
2395 rdma_ack_cm_event(cm_event);
2396
2397 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2398 if (ret) {
2399 ERROR(errp, "posting second control recv!")do { fprintf(stderr, "RDMA ERROR: " "posting second control recv!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "posting second control recv!"
); } } while (0)
;
2400 goto err_rdma_source_connect;
2401 }
2402
2403 rdma->control_ready_expected = 1;
2404 rdma->nb_sent = 0;
2405 return 0;
2406
2407err_rdma_source_connect:
2408 qemu_rdma_cleanup(rdma);
2409 return -1;
2410}
2411
2412static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2413{
2414 int ret = -EINVAL22, idx;
2415 struct rdma_cm_id *listen_id;
2416 char ip[40] = "unknown";
2417 struct rdma_addrinfo *res;
2418 char port_str[16];
2419
2420 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2421 rdma->wr_data[idx].control_len = 0;
2422 rdma->wr_data[idx].control_curr = NULL((void*)0);
2423 }
2424
2425 if (rdma->host == NULL((void*)0)) {
2426 ERROR(errp, "RDMA host is not set!")do { fprintf(stderr, "RDMA ERROR: " "RDMA host is not set!" "\n"
); if (errp && (*(errp) == ((void*)0))) { error_set(errp
, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "RDMA host is not set!"
); } } while (0)
;
2427 rdma->error_state = -EINVAL22;
2428 return -1;
2429 }
2430 /* create CM channel */
2431 rdma->channel = rdma_create_event_channel();
2432 if (!rdma->channel) {
2433 ERROR(errp, "could not create rdma event channel")do { fprintf(stderr, "RDMA ERROR: " "could not create rdma event channel"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "could not create rdma event channel"
); } } while (0)
;
2434 rdma->error_state = -EINVAL22;
2435 return -1;
2436 }
2437
2438 /* create CM id */
2439 ret = rdma_create_id(rdma->channel, &listen_id, NULL((void*)0), RDMA_PS_TCP);
2440 if (ret) {
2441 ERROR(errp, "could not create cm_id!")do { fprintf(stderr, "RDMA ERROR: " "could not create cm_id!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "could not create cm_id!"
); } } while (0)
;
2442 goto err_dest_init_create_listen_id;
2443 }
2444
2445 snprintf(port_str, 16, "%d", rdma->port);
2446 port_str[15] = '\0';
2447
2448 if (rdma->host && strcmp("", rdma->host)) {
2449 struct rdma_addrinfo *e;
2450
2451 ret = rdma_getaddrinfo(rdma->host, port_str, NULL((void*)0), &res);
2452 if (ret < 0) {
2453 ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host)do { fprintf(stderr, "RDMA ERROR: " "could not rdma_getaddrinfo address %s"
"\n", rdma->host); if (errp && (*(errp) == ((void
*)0))) { error_set(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: "
"could not rdma_getaddrinfo address %s", rdma->host); } }
while (0)
;
2454 goto err_dest_init_bind_addr;
2455 }
2456
2457 for (e = res; e != NULL((void*)0); e = e->ai_next) {
2458 inet_ntop(e->ai_family,
2459 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2460 DPRINTF("Trying %s => %s\n", rdma->host, ip)do { } while (0);
2461 ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2462 if (!ret) {
2463 if (e->ai_family == AF_INET610) {
2464 ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
2465 if (ret) {
2466 continue;
2467 }
2468 }
2469
2470 goto listen;
2471 }
2472 }
2473
2474 ERROR(errp, "Error: could not rdma_bind_addr!")do { fprintf(stderr, "RDMA ERROR: " "Error: could not rdma_bind_addr!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "Error: could not rdma_bind_addr!"
); } } while (0)
;
2475 goto err_dest_init_bind_addr;
2476 } else {
2477 ERROR(errp, "migration host and port not specified!")do { fprintf(stderr, "RDMA ERROR: " "migration host and port not specified!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "migration host and port not specified!"
); } } while (0)
;
2478 ret = -EINVAL22;
2479 goto err_dest_init_bind_addr;
2480 }
2481listen:
2482
2483 rdma->listen_id = listen_id;
2484 qemu_rdma_dump_gid("dest_init", listen_id);
2485 return 0;
2486
2487err_dest_init_bind_addr:
2488 rdma_destroy_id(listen_id);
2489err_dest_init_create_listen_id:
2490 rdma_destroy_event_channel(rdma->channel);
2491 rdma->channel = NULL((void*)0);
2492 rdma->error_state = ret;
2493 return ret;
2494
2495}
2496
2497static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2498{
2499 RDMAContext *rdma = NULL((void*)0);
2500 InetSocketAddress *addr;
2501
2502 if (host_port) {
2503 rdma = g_malloc0(sizeof(RDMAContext));
2504 memset(rdma, 0, sizeof(RDMAContext));
2505 rdma->current_index = -1;
2506 rdma->current_chunk = -1;
2507
2508 addr = inet_parse(host_port, NULL((void*)0));
2509 if (addr != NULL((void*)0)) {
2510 rdma->port = atoi(addr->port);
2511 rdma->host = g_strdup(addr->host);
2512 } else {
2513 ERROR(errp, "bad RDMA migration address '%s'", host_port)do { fprintf(stderr, "RDMA ERROR: " "bad RDMA migration address '%s'"
"\n", host_port); if (errp && (*(errp) == ((void*)0)
)) { error_set(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: "
"bad RDMA migration address '%s'", host_port); } } while (0)
;
2514 g_free(rdma);
2515 return NULL((void*)0);
2516 }
2517 }
2518
2519 return rdma;
2520}
2521
2522/*
2523 * QEMUFile interface to the control channel.
2524 * SEND messages for control only.
2525 * pc.ram is handled with regular RDMA messages.
2526 */
2527static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
2528 int64_t pos, int size)
2529{
2530 QEMUFileRDMA *r = opaque;
2531 QEMUFile *f = r->file;
2532 RDMAContext *rdma = r->rdma;
2533 size_t remaining = size;
2534 uint8_t * data = (void *) buf;
2535 int ret;
2536
2537 CHECK_ERROR_STATE()do { if (rdma->error_state) { if (!rdma->error_reported
) { fprintf(stderr, "RDMA is in an error state waiting migration"
" to abort!\n"); rdma->error_reported = 1; } return rdma->
error_state; } } while (0);
;
2538
2539 /*
2540 * Push out any writes that
2541 * we're queued up for pc.ram.
2542 */
2543 ret = qemu_rdma_write_flush(f, rdma);
2544 if (ret < 0) {
2545 rdma->error_state = ret;
2546 return ret;
2547 }
2548
2549 while (remaining) {
2550 RDMAControlHeader head;
2551
2552 r->len = MIN(remaining, RDMA_SEND_INCREMENT)(((remaining) < (32768)) ? (remaining) : (32768));
2553 remaining -= r->len;
2554
2555 head.len = r->len;
2556 head.type = RDMA_CONTROL_QEMU_FILE;
2557
2558 ret = qemu_rdma_exchange_send(rdma, &head, data, NULL((void*)0), NULL((void*)0), NULL((void*)0));
2559
2560 if (ret < 0) {
2561 rdma->error_state = ret;
2562 return ret;
2563 }
2564
2565 data += r->len;
2566 }
2567
2568 return size;
2569}
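
The loop above slices the device-state byte stream into SEND messages of at most RDMA_SEND_INCREMENT bytes each. A minimal stand-alone model of that slicing (illustration only):

#include <stdio.h>

#define SEND_INCREMENT 32768           /* mirrors RDMA_SEND_INCREMENT */
#define MIN(a, b) ((a) < (b) ? (a) : (b))

int main(void)
{
    /* A 100000-byte buffer is carried by the control channel in
     * several <= 32 KB SEND messages, as in qemu_rdma_put_buffer(). */
    size_t remaining = 100000, messages = 0;

    while (remaining) {
        size_t this_len = MIN(remaining, (size_t)SEND_INCREMENT);
        remaining -= this_len;
        messages++;
    }
    printf("sent in %zu control messages\n", messages);
    return 0;
}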
2570
2571static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
2572 int size, int idx)
2573{
2574 size_t len = 0;
2575
2576 if (rdma->wr_data[idx].control_len) {
2577 DDDPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",do { } while (0)
2578 rdma->wr_data[idx].control_len, size)do { } while (0);
2579
2580 len = MIN(size, rdma->wr_data[idx].control_len)(((size) < (rdma->wr_data[idx].control_len)) ? (size) :
(rdma->wr_data[idx].control_len))
;
2581 memcpy(buf, rdma->wr_data[idx].control_curr, len);
2582 rdma->wr_data[idx].control_curr += len;
2583 rdma->wr_data[idx].control_len -= len;
2584 }
2585
2586 return len;
2587}
2588
2589/*
2590 * QEMUFile interface to the control channel.
2591 * RDMA links don't use bytestreams, so we have to
2592 * return bytes to QEMUFile opportunistically.
2593 */
2594static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
2595 int64_t pos, int size)
2596{
2597 QEMUFileRDMA *r = opaque;
2598 RDMAContext *rdma = r->rdma;
2599 RDMAControlHeader head;
2600 int ret = 0;
2601
2602 CHECK_ERROR_STATE()do { if (rdma->error_state) { if (!rdma->error_reported
) { fprintf(stderr, "RDMA is in an error state waiting migration"
" to abort!\n"); rdma->error_reported = 1; } return rdma->
error_state; } } while (0);
;
2603
2604 /*
2605 * First, we hold on to the last SEND message we
2606 * were given and dish out the bytes until we run
2607 * out of bytes.
2608 */
2609 r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
2610 if (r->len) {
2611 return r->len;
2612 }
2613
2614 /*
2615 * Once we run out, we block and wait for another
2616 * SEND message to arrive.
2617 */
2618 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);
2619
2620 if (ret < 0) {
2621 rdma->error_state = ret;
2622 return ret;
2623 }
2624
2625 /*
2626 * SEND was received with new bytes, now try again.
2627 */
2628 return qemu_rdma_fill(r->rdma, buf, size, 0);
2629}
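
qemu_rdma_fill() and qemu_rdma_get_buffer() together implement the behaviour described above: serve any leftover bytes from the last SEND first, and only block for the next SEND once the buffer is empty. A self-contained sketch with a hypothetical buffer type:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

typedef struct {                       /* hypothetical stand-in buffer */
    uint8_t data[64];
    size_t  len;                       /* bytes still unconsumed */
    size_t  pos;                       /* read cursor */
} CtrlBuf;

/* Serve a read from leftover bytes; return how many were copied.
 * A return of 0 means the caller must block for the next SEND. */
static size_t fill_from_buffer(CtrlBuf *b, uint8_t *out, size_t want)
{
    size_t n = MIN(want, b->len);
    memcpy(out, b->data + b->pos, n);
    b->pos += n;
    b->len -= n;
    return n;
}

int main(void)
{
    CtrlBuf b = { .data = "header-and-payload", .len = 18, .pos = 0 };
    uint8_t out[8];
    size_t first  = fill_from_buffer(&b, out, 8);   /* 8 */
    size_t second = fill_from_buffer(&b, out, 8);   /* 8 */
    size_t third  = fill_from_buffer(&b, out, 8);   /* 2; next call -> 0 */

    printf("%zu %zu %zu\n", first, second, third);
    return 0;
}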
2630
2631/*
2632 * Block until all the outstanding chunks have been delivered by the hardware.
2633 */
2634static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
2635{
2636 int ret;
2637
2638 if (qemu_rdma_write_flush(f, rdma) < 0) {
2639 return -EIO5;
2640 }
2641
2642 while (rdma->nb_sent) {
2643 ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL((void*)0));
2644 if (ret < 0) {
2645 fprintf(stderrstderr, "rdma migration: complete polling error!\n");
2646 return -EIO5;
2647 }
2648 }
2649
2650 qemu_rdma_unregister_waiting(rdma);
2651
2652 return 0;
2653}
2654
2655static int qemu_rdma_close(void *opaque)
2656{
2657 DPRINTF("Shutting down connection.\n")do { } while (0);
2658 QEMUFileRDMA *r = opaque;
2659 if (r->rdma) {
2660 qemu_rdma_cleanup(r->rdma);
2661 g_free(r->rdma);
2662 }
2663 g_free(r);
2664 return 0;
2665}
2666
2667/*
2668 * Parameters:
2669 * @offset == 0 :
2670 * This means that 'block_offset' is a full virtual address that does not
2671 * belong to a RAMBlock of the virtual machine and instead
2672 * represents a private malloc'd memory area that the caller wishes to
2673 * transfer.
2674 *
2675 * @offset != 0 :
2676 * Offset is an offset to be added to block_offset and used
2677 * to also lookup the corresponding RAMBlock.
2678 *
2679 * @size > 0 :
2680 * Initiate a transfer of this size.
2681 *
2682 * @size == 0 :
2683 * A 'hint' or 'advice' that means that we wish to speculatively
2684 * and asynchronously unregister this memory. In this case, there is no
2685 * guarantee that the unregister will actually happen, for example,
2686 * if the memory is being actively transmitted. Additionally, the memory
2687 * may be re-registered at any future time if a write within the same
2688 * chunk was requested again, even if you attempted to unregister it
2689 * here.
2690 *
2691 * @size < 0 : TODO, not yet supported
2692 * Unregister the memory NOW. This means that the caller does not
2693 * expect there to be any future RDMA transfers and we just want to clean
2694 * things up. This is used in case the upper layer owns the memory and
2695 * cannot wait for qemu_fclose() to occur.
2696 *
2697 * @bytes_sent : User-specified pointer to indicate how many bytes were
2698 * sent. Usually, this will not be more than a few bytes of
2699 * the protocol because most transfers are sent asynchronously.
2700 */
2701static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
2702 ram_addr_t block_offset, ram_addr_t offset,
2703 size_t size, int *bytes_sent)
2704{
2705 QEMUFileRDMA *rfile = opaque;
2706 RDMAContext *rdma = rfile->rdma;
2707 int ret;
2708
2709 CHECK_ERROR_STATE()do { if (rdma->error_state) { if (!rdma->error_reported
) { fprintf(stderr, "RDMA is in an error state waiting migration"
" to abort!\n"); rdma->error_reported = 1; } return rdma->
error_state; } } while (0);
;
2710
2711 qemu_fflush(f);
2712
2713 if (size > 0) {
2714 /*
2715 * Add this page to the current 'chunk'. If the chunk
2716 * is full, or the page doesn't belong to the current chunk,
2717 * an actual RDMA write will occur and a new chunk will be formed.
2718 */
2719 ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
2720 if (ret < 0) {
2721 fprintf(stderrstderr, "rdma migration: write error! %d\n", ret);
2722 goto err;
2723 }
2724
2725 /*
2726 * We always return 1 byte because the RDMA
2727 * protocol is completely asynchronous. We do not yet know
2728 * whether an identified chunk is zero or not because we're
2729 * waiting for other pages to potentially be merged with
2730 * the current chunk. So, we have to call qemu_update_position()
2731 * later on when the actual write occurs.
2732 */
2733 if (bytes_sent) {
2734 *bytes_sent = 1;
2735 }
2736 } else {
2737 uint64_t index, chunk;
2738
2739 /* TODO: Change QEMUFileOps prototype to be signed: size_t => long
2740 if (size < 0) {
2741 ret = qemu_rdma_drain_cq(f, rdma);
2742 if (ret < 0) {
2743 fprintf(stderr, "rdma: failed to synchronously drain"
2744 " completion queue before unregistration.\n");
2745 goto err;
2746 }
2747 }
2748 */
2749
2750 ret = qemu_rdma_search_ram_block(rdma, block_offset,
2751 offset, size, &index, &chunk);
2752
2753 if (ret) {
2754 fprintf(stderrstderr, "ram block search failed\n");
2755 goto err;
2756 }
2757
2758 qemu_rdma_signal_unregister(rdma, index, chunk, 0);
2759
2760 /*
2761 * TODO: Synchronous, guaranteed unregistration (should not occur during
2762 * fast-path). Otherwise, unregisters will process on the next call to
2763 * qemu_rdma_drain_cq()
2764 if (size < 0) {
2765 qemu_rdma_unregister_waiting(rdma);
2766 }
2767 */
2768 }
2769
2770 /*
2771 * Drain the Completion Queue if possible, but do not block,
2772 * just poll.
2773 *
2774 * If nothing to poll, the end of the iteration will do this
2775 * again to make sure we don't overflow the request queue.
2776 */
2777 while (1) {
2778 uint64_t wr_id, wr_id_in;
2779 int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL((void*)0));
2780 if (ret < 0) {
2781 fprintf(stderrstderr, "rdma migration: polling error! %d\n", ret);
2782 goto err;
2783 }
2784
2785 wr_id = wr_id_in & RDMA_WRID_TYPE_MASK((1UL << 16UL) - 1UL);
2786
2787 if (wr_id == RDMA_WRID_NONE) {
2788 break;
2789 }
2790 }
2791
2792 return RAM_SAVE_CONTROL_DELAYED-2000;
2793err:
2794 rdma->error_state = ret;
2795 return ret;
2796}
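
The parameter contract documented above (size > 0, size == 0, size < 0) can be summarised by a small dispatcher. This is only an illustration of the contract, not QEMU code.

#include <stdio.h>

/* Toy summary of the @size contract of qemu_rdma_save_page(). */
static const char *save_page_action(long size)
{
    if (size > 0) {
        return "merge into current chunk; RDMA-write when the chunk fills";
    } else if (size == 0) {
        return "hint: queue the chunk for asynchronous unregistration";
    }
    return "TODO in this version: synchronous, guaranteed unregistration";
}

int main(void)
{
    printf("%s\n%s\n%s\n",
           save_page_action(4096),
           save_page_action(0),
           save_page_action(-1));
    return 0;
}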
2797
2798static int qemu_rdma_accept(RDMAContext *rdma)
2799{
2800 RDMACapabilities cap;
2801 struct rdma_conn_param conn_param = {
2802 .responder_resources = 2,
2803 .private_data = &cap,
2804 .private_data_len = sizeof(cap),
2805 };
2806 struct rdma_cm_event *cm_event;
2807 struct ibv_context *verbs;
2808 int ret = -EINVAL22;
2809 int idx;
2810
2811 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2812 if (ret) {
2813 goto err_rdma_dest_wait;
2814 }
2815
2816 if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
2817 rdma_ack_cm_event(cm_event);
2818 goto err_rdma_dest_wait;
2819 }
2820
2821 memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2822
2823 network_to_caps(&cap);
2824
2825 if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT1) {
2826 fprintf(stderrstderr, "Unknown source RDMA version: %d, bailing...\n",
2827 cap.version);
2828 rdma_ack_cm_event(cm_event);
2829 goto err_rdma_dest_wait;
2830 }
2831
2832 /*
2833 * Respond with only the capabilities this version of QEMU knows about.
2834 */
2835 cap.flags &= known_capabilities;
2836
2837 /*
2838 * Enable the ones that we do know about.
2839 * Add other checks here as new ones are introduced.
2840 */
2841 if (cap.flags & RDMA_CAPABILITY_PIN_ALL0x01) {
2842 rdma->pin_all = true1;
2843 }
2844
2845 rdma->cm_id = cm_event->id;
2846 verbs = cm_event->id->verbs;
2847
2848 rdma_ack_cm_event(cm_event);
2849
2850 DPRINTF("Memory pin all: %s\n", rdma->pin_all ? "enabled" : "disabled")do { } while (0);
2851
2852 caps_to_network(&cap);
2853
2854 DPRINTF("verbs context after listen: %p\n", verbs)do { } while (0);
2855
2856 if (!rdma->verbs) {
2857 rdma->verbs = verbs;
2858 } else if (rdma->verbs != verbs) {
2859 fprintf(stderrstderr, "ibv context not matching %p, %p!\n",
2860 rdma->verbs, verbs);
2861 goto err_rdma_dest_wait;
2862 }
2863
2864 qemu_rdma_dump_id("dest_init", verbs);
2865
2866 ret = qemu_rdma_alloc_pd_cq(rdma);
2867 if (ret) {
2868 fprintf(stderrstderr, "rdma migration: error allocating pd and cq!\n");
2869 goto err_rdma_dest_wait;
2870 }
2871
2872 ret = qemu_rdma_alloc_qp(rdma);
2873 if (ret) {
2874 fprintf(stderrstderr, "rdma migration: error allocating qp!\n");
2875 goto err_rdma_dest_wait;
2876 }
2877
2878 ret = qemu_rdma_init_ram_blocks(rdma);
2879 if (ret) {
2880 fprintf(stderrstderr, "rdma migration: error initializing ram blocks!\n");
2881 goto err_rdma_dest_wait;
2882 }
2883
2884 for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2885 ret = qemu_rdma_reg_control(rdma, idx);
2886 if (ret) {
2887 fprintf(stderrstderr, "rdma: error registering %d control!\n", idx);
2888 goto err_rdma_dest_wait;
2889 }
2890 }
2891
2892 qemu_set_fd_handler2(rdma->channel->fd, NULL((void*)0), NULL((void*)0), NULL((void*)0), NULL((void*)0));
2893
2894 ret = rdma_accept(rdma->cm_id, &conn_param);
2895 if (ret) {
2896 fprintf(stderrstderr, "rdma_accept returns %d!\n", ret);
2897 goto err_rdma_dest_wait;
2898 }
2899
2900 ret = rdma_get_cm_event(rdma->channel, &cm_event);
2901 if (ret) {
2902 fprintf(stderrstderr, "rdma_accept get_cm_event failed %d!\n", ret);
2903 goto err_rdma_dest_wait;
2904 }
2905
2906 if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2907 fprintf(stderrstderr, "rdma_accept not event established!\n");
2908 rdma_ack_cm_event(cm_event);
2909 goto err_rdma_dest_wait;
2910 }
2911
2912 rdma_ack_cm_event(cm_event);
2913 rdma->connected = true1;
2914
2915 ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2916 if (ret) {
2917 fprintf(stderrstderr, "rdma migration: error posting second control recv!\n");
2918 goto err_rdma_dest_wait;
2919 }
2920
2921 qemu_rdma_dump_gid("dest_connect", rdma->cm_id);
2922
2923 return 0;
2924
2925err_rdma_dest_wait:
2926 rdma->error_state = ret;
2927 qemu_rdma_cleanup(rdma);
2928 return ret;
2929}
2930
2931/*
2932 * During each iteration of the migration, we listen for instructions
2933 * by the source VM to perform dynamic page registrations before they
2934 * can perform RDMA operations.
2935 *
2936 * We respond with the 'rkey'.
2937 *
2938 * Keep doing this until the source tells us to stop.
2939 */
2940static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
2941 uint64_t flags)
2942{
2943 RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
2944 .type = RDMA_CONTROL_REGISTER_RESULT,
2945 .repeat = 0,
2946 };
2947 RDMAControlHeader unreg_resp = { .len = 0,
2948 .type = RDMA_CONTROL_UNREGISTER_FINISHED,
2949 .repeat = 0,
2950 };
2951 RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
2952 .repeat = 1 };
2953 QEMUFileRDMA *rfile = opaque;
2954 RDMAContext *rdma = rfile->rdma;
2955 RDMALocalBlocks *local = &rdma->local_ram_blocks;
2956 RDMAControlHeader head;
2957 RDMARegister *reg, *registers;
2958 RDMACompress *comp;
2959 RDMARegisterResult *reg_result;
2960 static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE4096];
2961 RDMALocalBlock *block;
2962 void *host_addr;
2963 int ret = 0;
2964 int idx = 0;
2965 int count = 0;
2966 int i = 0;
2967
2968 CHECK_ERROR_STATE()do { if (rdma->error_state) { if (!rdma->error_reported
) { fprintf(stderr, "RDMA is in an error state waiting migration"
" to abort!\n"); rdma->error_reported = 1; } return rdma->
error_state; } } while (0);
;
2969
2970 do {
2971 DDDPRINTF("Waiting for next request %" PRIu64 "...\n", flags)do { } while (0);
2972
2973 ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);
2974
2975 if (ret < 0) {
2976 break;
2977 }
2978
2979 if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE4096) {
2980 fprintf(stderrstderr, "rdma: Too many requests in this message (%d)."
2981 "Bailing.\n", head.repeat);
2982 ret = -EIO5;
2983 break;
2984 }
2985
2986 switch (head.type) {
2987 case RDMA_CONTROL_COMPRESS:
2988 comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
2989 network_to_compress(comp);
2990
2991 DDPRINTF("Zapping zero chunk: %" PRId64do { } while (0)
2992 " bytes, index %d, offset %" PRId64 "\n",do { } while (0)
2993 comp->length, comp->block_idx, comp->offset)do { } while (0);
2994 block = &(rdma->local_ram_blocks.block[comp->block_idx]);
2995
2996 host_addr = block->local_host_addr +
2997 (comp->offset - block->offset);
2998
2999 ram_handle_compressed(host_addr, comp->value, comp->length);
3000 break;
3001
3002 case RDMA_CONTROL_REGISTER_FINISHED:
3003 DDDPRINTF("Current registrations complete.\n")do { } while (0);
3004 goto out;
3005
3006 case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
3007 DPRINTF("Initial setup info requested.\n")do { } while (0);
3008
3009 if (rdma->pin_all) {
3010 ret = qemu_rdma_reg_whole_ram_blocks(rdma);
3011 if (ret) {
3012 fprintf(stderrstderr, "rdma migration: error dest "
3013 "registering ram blocks!\n");
3014 goto out;
3015 }
3016 }
3017
3018 /*
3019 * Dest uses this to prepare to transmit the RAMBlock descriptions
3020 * to the source VM after connection setup.
3021 * Both sides use the "remote" structure to communicate and update
3022 * their "local" descriptions with what was sent.
3023 */
3024 for (i = 0; i < local->nb_blocks; i++) {
3025 rdma->block[i].remote_host_addr =
3026 (uint64_t)(local->block[i].local_host_addr);
3027
3028 if (rdma->pin_all) {
3029 rdma->block[i].remote_rkey = local->block[i].mr->rkey;
3030 }
3031
3032 rdma->block[i].offset = local->block[i].offset;
3033 rdma->block[i].length = local->block[i].length;
3034
3035 remote_block_to_network(&rdma->block[i]);
3036 }
3037
3038 blocks.len = rdma->local_ram_blocks.nb_blocks
3039 * sizeof(RDMARemoteBlock);
3040
3041
3042 ret = qemu_rdma_post_send_control(rdma,
3043 (uint8_t *) rdma->block, &blocks);
3044
3045 if (ret < 0) {
3046 fprintf(stderrstderr, "rdma migration: error sending remote info!\n");
3047 goto out;
3048 }
3049
3050 break;
3051 case RDMA_CONTROL_REGISTER_REQUEST:
3052 DDPRINTF("There are %d registration requests\n", head.repeat)do { } while (0);
3053
3054 reg_resp.repeat = head.repeat;
3055 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3056
3057 for (count = 0; count < head.repeat; count++) {
3058 uint64_t chunk;
3059 uint8_t *chunk_start, *chunk_end;
3060
3061 reg = &registers[count];
3062 network_to_register(reg);
3063
3064 reg_result = &results[count];
3065
3066 DDPRINTF("Registration request (%d): index %d, current_addr %"do { } while (0)
3067 PRIu64 " chunks: %" PRIu64 "\n", count,do { } while (0)
3068 reg->current_index, reg->key.current_addr, reg->chunks)do { } while (0);
3069
3070 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3071 if (block->is_ram_block) {
3072 host_addr = (block->local_host_addr +
3073 (reg->key.current_addr - block->offset));
3074 chunk = ram_chunk_index(block->local_host_addr,
3075 (uint8_t *) host_addr);
3076 } else {
3077 chunk = reg->key.chunk;
3078 host_addr = block->local_host_addr +
3079 (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT20));
3080 }
3081 chunk_start = ram_chunk_start(block, chunk);
3082 chunk_end = ram_chunk_end(block, chunk + reg->chunks);
3083 if (qemu_rdma_register_and_get_keys(rdma, block,
3084 (uint8_t *)host_addr, NULL((void*)0), &reg_result->rkey,
3085 chunk, chunk_start, chunk_end)) {
3086 fprintf(stderrstderr, "cannot get rkey!\n");
3087 ret = -EINVAL22;
3088 goto out;
3089 }
3090
3091 reg_result->host_addr = (uint64_t) block->local_host_addr;
3092
3093 DDPRINTF("Registered rkey for this request: %x\n",do { } while (0)
3094 reg_result->rkey)do { } while (0);
3095
3096 result_to_network(reg_result);
3097 }
3098
3099 ret = qemu_rdma_post_send_control(rdma,
3100 (uint8_t *) results, &reg_resp);
3101
3102 if (ret < 0) {
3103 fprintf(stderrstderr, "Failed to send control buffer!\n");
3104 goto out;
3105 }
3106 break;
3107 case RDMA_CONTROL_UNREGISTER_REQUEST:
3108 DDPRINTF("There are %d unregistration requests\n", head.repeat)do { } while (0);
3109 unreg_resp.repeat = head.repeat;
3110 registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
3111
3112 for (count = 0; count < head.repeat; count++) {
3113 reg = &registers[count];
3114 network_to_register(reg);
3115
3116 DDPRINTF("Unregistration request (%d): "do { } while (0)
3117 " index %d, chunk %" PRIu64 "\n",do { } while (0)
3118 count, reg->current_index, reg->key.chunk)do { } while (0);
3119
3120 block = &(rdma->local_ram_blocks.block[reg->current_index]);
3121
3122 ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
3123 block->pmr[reg->key.chunk] = NULL((void*)0);
3124
3125 if (ret != 0) {
3126 perror("rdma unregistration chunk failed");
3127 ret = -ret;
3128 goto out;
3129 }
3130
3131 rdma->total_registrations--;
3132
3133 DDPRINTF("Unregistered chunk %" PRIu64 " successfully.\n",do { } while (0)
3134 reg->key.chunk)do { } while (0);
3135 }
3136
3137 ret = qemu_rdma_post_send_control(rdma, NULL((void*)0), &unreg_resp);
3138
3139 if (ret < 0) {
3140 fprintf(stderrstderr, "Failed to send control buffer!\n");
3141 goto out;
3142 }
3143 break;
3144 case RDMA_CONTROL_REGISTER_RESULT:
3145 fprintf(stderrstderr, "Invalid RESULT message at dest.\n");
3146 ret = -EIO5;
3147 goto out;
3148 default:
3149 fprintf(stderrstderr, "Unknown control message %s\n",
3150 control_desc[head.type]);
3151 ret = -EIO5;
3152 goto out;
3153 }
3154 } while (1);
3155out:
3156 if (ret < 0) {
3157 rdma->error_state = ret;
3158 }
3159 return ret;
3160}
3161
3162static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
3163 uint64_t flags)
3164{
3165 QEMUFileRDMA *rfile = opaque;
3166 RDMAContext *rdma = rfile->rdma;
3167
3168 CHECK_ERROR_STATE()do { if (rdma->error_state) { if (!rdma->error_reported
) { fprintf(stderr, "RDMA is in an error state waiting migration"
" to abort!\n"); rdma->error_reported = 1; } return rdma->
error_state; } } while (0);
;
3169
3170 DDDPRINTF("start section: %" PRIu64 "\n", flags)do { } while (0);
3171 qemu_put_be64(f, RAM_SAVE_FLAG_HOOK0x80);
3172 qemu_fflush(f);
3173
3174 return 0;
3175}
3176
3177/*
3178 * Inform dest that dynamic registrations are done for now.
3179 * First, flush writes, if any.
3180 */
3181static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
3182 uint64_t flags)
3183{
3184 Error *local_err = NULL((void*)0), **errp = &local_err;
3185 QEMUFileRDMA *rfile = opaque;
3186 RDMAContext *rdma = rfile->rdma;
3187 RDMAControlHeader head = { .len = 0, .repeat = 1 };
3188 int ret = 0;
3189
3190 CHECK_ERROR_STATE()do { if (rdma->error_state) { if (!rdma->error_reported
) { fprintf(stderr, "RDMA is in an error state waiting migration"
" to abort!\n"); rdma->error_reported = 1; } return rdma->
error_state; } } while (0);
;
3191
3192 qemu_fflush(f);
3193 ret = qemu_rdma_drain_cq(f, rdma);
3194
3195 if (ret < 0) {
3196 goto err;
3197 }
3198
3199 if (flags == RAM_CONTROL_SETUP0) {
3200 RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
3201 RDMALocalBlocks *local = &rdma->local_ram_blocks;
3202 int reg_result_idx, i, j, nb_remote_blocks;
3203
3204 head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
3205 DPRINTF("Sending registration setup for ram blocks...\n")do { } while (0);
3206
3207 /*
3208 * Make sure that we parallelize the pinning on both sides.
3209 * For very large guests, doing this serially takes a really
3210 * long time, so we have to 'interleave' the pinning locally
3211 * with the control messages by performing the pinning on this
3212 * side before we receive the control response from the other
3213 * side that the pinning has completed.
3214 */
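/*
 * A sketch of the interleaving, assuming qemu_rdma_exchange_send()
 * invokes its callback argument after posting the request but before
 * blocking on the response: with pin_all negotiated, the callback is
 * qemu_rdma_reg_whole_ram_blocks, so the source pins its own ram
 * blocks while the destination is still preparing its reply.
 */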
3215 ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
3216 &reg_result_idx, rdma->pin_all ?
3217 qemu_rdma_reg_whole_ram_blocks : NULL);
3218 if (ret < 0) {
3219 ERROR(errp, "receiving remote info!")do { fprintf(stderr, "RDMA ERROR: " "receiving remote info!" "\n"
); if (errp && (*(errp) == ((void*)0))) { error_set(errp
, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "receiving remote info!"
); } } while (0)
;
3220 return ret;
3221 }
3222
3223 nb_remote_blocks = resp.len / sizeof(RDMARemoteBlock);
3224
3225 /*
3226 * The protocol uses two different sets of rkeys (mutually exclusive):
3227 * 1. One key to represent the virtual address of the entire ram block.
3228 * (dynamic chunk registration disabled - pin everything with one rkey.)
3229 * 2. One to represent individual chunks within a ram block.
3230 * (dynamic chunk registration enabled - pin individual chunks.)
3231 *
3232 * Once the capability is successfully negotiated, the destination transmits
3233 * the keys to use (or sends them later) including the virtual addresses
3234 * and then propagates the remote ram block descriptions to his local copy.
3235 */
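/*
 * A sketch of what each mode carries, inferred from the fields used
 * below (remote_rkey, remote_host_addr) and from the registration
 * handler above (reg_result->rkey, reg_result->host_addr): with
 * pin_all, each RDMARemoteBlock in this response holds one rkey and
 * base address covering the whole block; without it, those per-block
 * keys go unused and per-chunk rkeys arrive later in
 * RDMA_CONTROL_REGISTER_RESULT replies.
 */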
3236
3237 if (local->nb_blocks != nb_remote_blocks) {
3238 ERROR(errp, "ram blocks mismatch #1! "do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #1! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #1! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
3239 "Your QEMU command line parameters are probably "do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #1! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #1! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
3240 "not identical on both the source and destination.")do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #1! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #1! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
;
3241 return -EINVAL;
3242 }
3243
3244 qemu_rdma_move_header(rdma, reg_result_idx, &resp);
3245 memcpy(rdma->block,
3246 rdma->wr_data[reg_result_idx].control_curr, resp.len);
3247 for (i = 0; i < nb_remote_blocks; i++) {
3248 network_to_remote_block(&rdma->block[i]);
3249
3250 /* search local ram blocks */
3251 for (j = 0; j < local->nb_blocks; j++) {
3252 if (rdma->block[i].offset != local->block[j].offset) {
3253 continue;
3254 }
3255
3256 if (rdma->block[i].length != local->block[j].length) {
3257 ERROR(errp, "ram blocks mismatch #2! "do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #2! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #2! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
3258 "Your QEMU command line parameters are probably "do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #2! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #2! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
3259 "not identical on both the source and destination.")do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #2! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #2! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
;
3260 return -EINVAL;
3261 }
3262 local->block[j].remote_host_addr =
3263 rdma->block[i].remote_host_addr;
3264 local->block[j].remote_rkey = rdma->block[i].remote_rkey;
3265 break;
3266 }
3267
3268 if (j >= local->nb_blocks) {
3269 ERROR(errp, "ram blocks mismatch #3! "do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #3! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #3! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
3270 "Your QEMU command line parameters are probably "do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #3! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #3! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
3271 "not identical on both the source and destination.")do { fprintf(stderr, "RDMA ERROR: " "ram blocks mismatch #3! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "ram blocks mismatch #3! "
"Your QEMU command line parameters are probably " "not identical on both the source and destination."
); } } while (0)
;
3272 return -EINVAL;
3273 }
3274 }
3275 }
3276
3277 DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags)do { } while (0);
3278
3279 head.type = RDMA_CONTROL_REGISTER_FINISHED;
3280 ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);
3281
3282 if (ret < 0) {
3283 goto err;
3284 }
3285
3286 return 0;
3287err:
3288 rdma->error_state = ret;
3289 return ret;
3290}
3291
3292static int qemu_rdma_get_fd(void *opaque)
3293{
3294 QEMUFileRDMA *rfile = opaque;
3295 RDMAContext *rdma = rfile->rdma;
3296
3297 return rdma->comp_channel->fd;
3298}
3299
3300const QEMUFileOps rdma_read_ops = {
3301 .get_buffer = qemu_rdma_get_buffer,
3302 .get_fd = qemu_rdma_get_fd,
3303 .close = qemu_rdma_close,
3304 .hook_ram_load = qemu_rdma_registration_handle,
3305};
3306
3307const QEMUFileOps rdma_write_ops = {
3308 .put_buffer = qemu_rdma_put_buffer,
3309 .close = qemu_rdma_close,
3310 .before_ram_iterate = qemu_rdma_registration_start,
3311 .after_ram_iterate = qemu_rdma_registration_stop,
3312 .save_page = qemu_rdma_save_page,
3313};
3314
3315static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
3316{
3317 QEMUFileRDMA *r = g_malloc0(sizeof(QEMUFileRDMA));
3318
3319 if (qemu_file_mode_is_not_valid(mode)) {
3320 return NULL;
3321 }
3322
3323 r->rdma = rdma;
3324
3325 if (mode[0] == 'w') {
3326 r->file = qemu_fopen_ops(r, &rdma_write_ops);
3327 } else {
3328 r->file = qemu_fopen_ops(r, &rdma_read_ops);
3329 }
3330
3331 return r->file;
3332}
3333
3334static void rdma_accept_incoming_migration(void *opaque)
3335{
3336 RDMAContext *rdma = opaque;
3337 int ret;
3338 QEMUFile *f;
3339 Error *local_err = NULL, **errp = &local_err;
3340
3341 DPRINTF("Accepting rdma connection...\n")do { } while (0);
3342 ret = qemu_rdma_accept(rdma);
3343
3344 if (ret) {
3345 ERROR(errp, "RDMA Migration initialization failed!")do { fprintf(stderr, "RDMA ERROR: " "RDMA Migration initialization failed!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "RDMA Migration initialization failed!"
); } } while (0)
;
3346 return;
3347 }
3348
3349 DPRINTF("Accepted migration\n")do { } while (0);
3350
3351 f = qemu_fopen_rdma(rdma, "rb");
3352 if (f == NULL) {
3353 ERROR(errp, "could not qemu_fopen_rdma!")do { fprintf(stderr, "RDMA ERROR: " "could not qemu_fopen_rdma!"
"\n"); if (errp && (*(errp) == ((void*)0))) { error_set
(errp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "could not qemu_fopen_rdma!"
); } } while (0)
;
3354 qemu_rdma_cleanup(rdma);
3355 return;
3356 }
3357
3358 rdma->migration_started_on_destination = 1;
3359 process_incoming_migration(f);
3360}
3361
3362void rdma_start_incoming_migration(const char *host_port, Error **errp)
3363{
3364 int ret;
3365 RDMAContext *rdma;
3366 Error *local_err = NULL;
3367
3368 DPRINTF("Starting RDMA-based incoming migration\n")do { } while (0);
3369 rdma = qemu_rdma_data_init(host_port, &local_err);
3370
3371 if (rdma == NULL) {
3372 goto err;
3373 }
3374
3375 ret = qemu_rdma_dest_init(rdma, &local_err);
3376
3377 if (ret) {
3378 goto err;
3379 }
3380
3381 DPRINTF("qemu_rdma_dest_init success\n")do { } while (0);
3382
3383 ret = rdma_listen(rdma->listen_id, 5);
3384
3385 if (ret) {
3386 ERROR(errp, "listening on socket!")do { fprintf(stderr, "RDMA ERROR: " "listening on socket!" "\n"
); if (errp && (*(errp) == ((void*)0))) { error_set(errp
, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "listening on socket!"
); } } while (0)
;
3387 goto err;
3388 }
3389
3390 DPRINTF("rdma_listen success\n")do { } while (0);
3391
3392 qemu_set_fd_handler2(rdma->channel->fd, NULL,
3393 rdma_accept_incoming_migration, NULL,
3394 (void *)(intptr_t) rdma);
3395 return;
3396err:
3397 error_propagate(errp, local_err);
3398 g_free(rdma);
3399}
3400
3401void rdma_start_outgoing_migration(void *opaque,
3402 const char *host_port, Error **errp)
3403{
3404 MigrationState *s = opaque;
3405 Error *local_err = NULL, **temp = &local_err;
3406 RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err);
3407 int ret = 0;
3408
3409 if (rdma == NULL) {
1. Taking false branch
3410 ERROR(temp, "Failed to initialize RDMA data structures! %d", ret)do { fprintf(stderr, "RDMA ERROR: " "Failed to initialize RDMA data structures! %d"
"\n", ret); if (temp && (*(temp) == ((void*)0))) { error_set
(temp, ERROR_CLASS_GENERIC_ERROR, "RDMA ERROR: " "Failed to initialize RDMA data structures! %d"
, ret); } } while (0)
;
3411 goto err;
3412 }
3413
3414 ret = qemu_rdma_source_init(rdma, &local_err,
2. Calling 'qemu_rdma_source_init'
3415 s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);
3416
3417 if (ret) {
3418 goto err;
3419 }
3420
3421 DPRINTF("qemu_rdma_source_init success\n")do { } while (0);
3422 ret = qemu_rdma_connect(rdma, &local_err);
3423
3424 if (ret) {
3425 goto err;
3426 }
3427
3428 DPRINTF("qemu_rdma_source_connect success\n")do { } while (0);
3429
3430 s->file = qemu_fopen_rdma(rdma, "wb");
3431 migrate_fd_connect(s);
3432 return;
3433err:
3434 error_propagate(errp, local_err);
3435 g_free(rdma);
3436 migrate_fd_error(s);
3437}