File: migration-rdma.c
Location: line 2236, column 21
Description: Access to field 'offset' results in a dereference of a null pointer (loaded from field 'block')
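
The pattern behind this report, reduced to a minimal standalone sketch (an
illustration of the analyzer's claim, not code taken from migration-rdma.c):
once the RDMALocalBlocks 'block' array has been freed and set to NULL (see
__qemu_rdma_delete_block() below), indexing it and loading 'offset'
dereferences a null pointer.

    #include <stddef.h>
    #include <stdint.h>

    typedef struct { uint64_t offset; } RDMALocalBlock;
    typedef struct { int nb_blocks; RDMALocalBlock *block; } RDMALocalBlocks;

    int main(void)
    {
        /* State after the last block has been deleted. */
        RDMALocalBlocks local = { .nb_blocks = 0, .block = NULL };
        RDMALocalBlock *b = &local.block[0]; /* pointer computed from NULL */
        return (int) b->offset;              /* the flagged dereference */
    }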
/*
 * RDMA protocol and interfaces
 *
 * Copyright IBM, Corp. 2010-2013
 *
 * Authors:
 *  Michael R. Hines <mrhines@us.ibm.com>
 *  Jiuxing Liu <jl@us.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2 or
 * later. See the COPYING file in the top-level directory.
 *
 */
#include "qemu-common.h"
#include "migration/migration.h"
#include "migration/qemu-file.h"
#include "exec/cpu-common.h"
#include "qemu/main-loop.h"
#include "qemu/sockets.h"
#include "qemu/bitmap.h"
#include "block/coroutine.h"
#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netdb.h>
#include <arpa/inet.h>
#include <string.h>
#include <rdma/rdma_cma.h>

//#define DEBUG_RDMA
//#define DEBUG_RDMA_VERBOSE
//#define DEBUG_RDMA_REALLY_VERBOSE

#ifdef DEBUG_RDMA
#define DPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DPRINTF(fmt, ...) \
    do { } while (0)
#endif

#ifdef DEBUG_RDMA_VERBOSE
#define DDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDPRINTF(fmt, ...) \
    do { } while (0)
#endif

#ifdef DEBUG_RDMA_REALLY_VERBOSE
#define DDDPRINTF(fmt, ...) \
    do { printf("rdma: " fmt, ## __VA_ARGS__); } while (0)
#else
#define DDDPRINTF(fmt, ...) \
    do { } while (0)
#endif

/*
 * Print an error on both the Monitor and the Log file.
 */
#define ERROR(errp, fmt, ...) \
    do { \
        fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \
        if (errp && (*(errp) == NULL)) { \
            error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \
        } \
    } while (0)

#define RDMA_RESOLVE_TIMEOUT_MS 10000

/* Do not merge data if larger than this. */
#define RDMA_MERGE_MAX (2 * 1024 * 1024)
#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096)
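/* i.e. at most 512 signaled sends may be outstanding (2 MiB / 4 KiB) */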

#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */

/*
 * This is only for non-live state being migrated.
 * Instead of RDMA_WRITE messages, we use RDMA_SEND
 * messages for that state, which requires a different
 * delivery design than main memory.
 */
#define RDMA_SEND_INCREMENT 32768

/*
 * Maximum size infiniband SEND message
 */
#define RDMA_CONTROL_MAX_BUFFER (512 * 1024)
#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096

#define RDMA_CONTROL_VERSION_CURRENT 1
/*
 * Capabilities for negotiation.
 */
#define RDMA_CAPABILITY_PIN_ALL 0x01

/*
 * Add the other flags above to this list of known capabilities
 * as they are introduced.
 */
static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL;

#define CHECK_ERROR_STATE() \
    do { \
        if (rdma->error_state) { \
            if (!rdma->error_reported) { \
                fprintf(stderr, "RDMA is in an error state waiting migration" \
                                " to abort!\n"); \
                rdma->error_reported = 1; \
            } \
            return rdma->error_state; \
        } \
    } while (0);
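
/*
 * Note (annotation): CHECK_ERROR_STATE() expects a variable named 'rdma' in
 * the enclosing scope and an int-compatible return type, since it returns
 * rdma->error_state on behalf of the caller.
 */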

/*
 * A work request ID is 64-bits and we split up these bits
 * into 3 parts:
 *
 * bits 0-15 : type of control message, 2^16
 * bits 16-29: ram block index, 2^14
 * bits 30-63: ram block chunk number, 2^34
 *
 * The last two bit ranges are only used for RDMA writes,
 * in order to track their completion and potentially
 * also track unregistration status of the message.
 */
#define RDMA_WRID_TYPE_SHIFT  0UL
#define RDMA_WRID_BLOCK_SHIFT 16UL
#define RDMA_WRID_CHUNK_SHIFT 30UL

#define RDMA_WRID_TYPE_MASK \
    ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL)

#define RDMA_WRID_BLOCK_MASK \
    (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL))

#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK)

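/*
 * Example (illustrative, not from the original source): a WRITE completion
 * for chunk 5 of block 2 carries
 *     wr_id = (5UL << RDMA_WRID_CHUNK_SHIFT) |
 *             (2UL << RDMA_WRID_BLOCK_SHIFT) | RDMA_WRID_RDMA_WRITE
 * and each field is recovered as (wr_id & <field>_MASK) >> <field>_SHIFT.
 */
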
/*
 * RDMA migration protocol:
 * 1. RDMA Writes (data messages, i.e. RAM)
 * 2. IB Send/Recv (control channel messages)
 */
enum {
    RDMA_WRID_NONE = 0,
    RDMA_WRID_RDMA_WRITE = 1,
    RDMA_WRID_SEND_CONTROL = 2000,
    RDMA_WRID_RECV_CONTROL = 4000,
};

const char *wrid_desc[] = {
    [RDMA_WRID_NONE] = "NONE",
    [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA",
    [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND",
    [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV",
};

/*
 * Work request IDs for IB SEND messages only (not RDMA writes).
 * This is used by the migration protocol to transmit
 * control messages (such as device state and registration commands)
 *
 * We could use more WRs, but we have enough for now.
 */
enum {
    RDMA_WRID_READY = 0,
    RDMA_WRID_DATA,
    RDMA_WRID_CONTROL,
    RDMA_WRID_MAX,
};

/*
 * SEND/RECV IB Control Messages.
 */
enum {
    RDMA_CONTROL_NONE = 0,
    RDMA_CONTROL_ERROR,
    RDMA_CONTROL_READY,               /* ready to receive */
    RDMA_CONTROL_QEMU_FILE,           /* QEMUFile-transmitted bytes */
    RDMA_CONTROL_RAM_BLOCKS_REQUEST,  /* RAMBlock synchronization */
    RDMA_CONTROL_RAM_BLOCKS_RESULT,   /* RAMBlock synchronization */
    RDMA_CONTROL_COMPRESS,            /* page contains repeat values */
    RDMA_CONTROL_REGISTER_REQUEST,    /* dynamic page registration */
    RDMA_CONTROL_REGISTER_RESULT,     /* key to use after registration */
    RDMA_CONTROL_REGISTER_FINISHED,   /* current iteration finished */
    RDMA_CONTROL_UNREGISTER_REQUEST,  /* dynamic UN-registration */
    RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */
};

const char *control_desc[] = {
    [RDMA_CONTROL_NONE] = "NONE",
    [RDMA_CONTROL_ERROR] = "ERROR",
    [RDMA_CONTROL_READY] = "READY",
    [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE",
    [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST",
    [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT",
    [RDMA_CONTROL_COMPRESS] = "COMPRESS",
    [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST",
    [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT",
    [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED",
    [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST",
    [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED",
};

/*
 * Memory and MR structures used to represent an IB Send/Recv work request.
 * This is *not* used for RDMA writes, only IB Send/Recv.
 */
typedef struct {
    uint8_t  control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */
    struct   ibv_mr *control_mr;               /* registration metadata */
    size_t   control_len;                      /* length of the message */
    uint8_t *control_curr;                     /* start of unconsumed bytes */
} RDMAWorkRequestData;

/*
 * Negotiate RDMA capabilities during connection-setup time.
 */
typedef struct {
    uint32_t version;
    uint32_t flags;
} RDMACapabilities;

static void caps_to_network(RDMACapabilities *cap)
{
    cap->version = htonl(cap->version);
    cap->flags = htonl(cap->flags);
}

static void network_to_caps(RDMACapabilities *cap)
{
    cap->version = ntohl(cap->version);
    cap->flags = ntohl(cap->flags);
}

/*
 * Representation of a RAMBlock from an RDMA perspective.
 * This is not transmitted, only local.
 * This and subsequent structures cannot be linked lists
 * because we're using a single IB message to transmit
 * the information. It's small anyway, so a list is overkill.
 */
typedef struct RDMALocalBlock {
    uint8_t  *local_host_addr; /* local virtual address */
    uint64_t remote_host_addr; /* remote virtual address */
    uint64_t offset;
    uint64_t length;
    struct   ibv_mr **pmr;     /* MRs for chunk-level registration */
    struct   ibv_mr *mr;       /* MR for non-chunk-level registration */
    uint32_t *remote_keys;     /* rkeys for chunk-level registration */
    uint32_t remote_rkey;      /* rkeys for non-chunk-level registration */
    int      index;            /* which block are we */
    bool     is_ram_block;
    int      nb_chunks;
    unsigned long *transit_bitmap;
    unsigned long *unregister_bitmap;
} RDMALocalBlock;

/*
 * Also represents a RAMblock, but only on the dest.
 * This gets transmitted by the dest during connection-time
 * to the source VM and then is used to populate the
 * corresponding RDMALocalBlock with
 * the information needed to perform the actual RDMA.
 */
typedef struct QEMU_PACKED RDMARemoteBlock {
    uint64_t remote_host_addr;
    uint64_t offset;
    uint64_t length;
    uint32_t remote_rkey;
    uint32_t padding;
} RDMARemoteBlock;

static uint64_t htonll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.lv[0] = htonl(v >> 32);
    u.lv[1] = htonl(v & 0xFFFFFFFFULL);
    return u.llv;
}

static uint64_t ntohll(uint64_t v)
{
    union { uint32_t lv[2]; uint64_t llv; } u;
    u.llv = v;
    return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]);
}

static void remote_block_to_network(RDMARemoteBlock *rb)
{
    rb->remote_host_addr = htonll(rb->remote_host_addr);
    rb->offset = htonll(rb->offset);
    rb->length = htonll(rb->length);
    rb->remote_rkey = htonl(rb->remote_rkey);
}

static void network_to_remote_block(RDMARemoteBlock *rb)
{
    rb->remote_host_addr = ntohll(rb->remote_host_addr);
    rb->offset = ntohll(rb->offset);
    rb->length = ntohll(rb->length);
    rb->remote_rkey = ntohl(rb->remote_rkey);
}

/*
 * Virtual address of the above structures used for transmitting
 * the RAMBlock descriptions at connection-time.
 * This structure is *not* transmitted.
 */
typedef struct RDMALocalBlocks {
    int nb_blocks;
    bool init;                 /* main memory init complete */
    RDMALocalBlock *block;
} RDMALocalBlocks;

/*
 * Main data structure for RDMA state.
 * While there is only one copy of this structure being allocated right now,
 * this is the place where one would start if you wanted to consider
 * having more than one RDMA connection open at the same time.
 */
typedef struct RDMAContext {
    char *host;
    int port;

    RDMAWorkRequestData wr_data[RDMA_WRID_MAX];

    /*
     * This is used by *_exchange_send() to figure out whether or not
     * the initial "READY" message has already been received or not.
     * This is because other functions may potentially poll() and detect
     * the READY message before send() does, in which case we need to
     * know if it completed.
     */
    int control_ready_expected;

    /* number of outstanding writes */
    int nb_sent;

    /* store info about current buffer so that we can
       merge it with future sends */
    uint64_t current_addr;
    uint64_t current_length;
    /* index of ram block the current buffer belongs to */
    int current_index;
    /* index of the chunk in the current ram block */
    int current_chunk;

    bool pin_all;

    /*
     * infiniband-specific variables for opening the device
     * and maintaining connection state and so forth.
     *
     * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in
     * cm_id->verbs, cm_id->channel, and cm_id->qp.
     */
    struct rdma_cm_id *cm_id;              /* connection manager ID */
    struct rdma_cm_id *listen_id;
    bool connected;

    struct ibv_context *verbs;
    struct rdma_event_channel *channel;
    struct ibv_qp *qp;                     /* queue pair */
    struct ibv_comp_channel *comp_channel; /* completion channel */
    struct ibv_pd *pd;                     /* protection domain */
    struct ibv_cq *cq;                     /* completion queue */

    /*
     * If a previous write failed (perhaps because of a failed
     * memory registration), then do not attempt any future work
     * and remember the error state.
     */
    int error_state;
    int error_reported;

    /*
     * Description of ram blocks used throughout the code.
     */
    RDMALocalBlocks local_ram_blocks;
    RDMARemoteBlock *block;

    /*
     * Migration on *destination* started.
     * Then use coroutine yield function.
     * Source runs in a thread, so we don't care.
     */
    int migration_started_on_destination;

    int total_registrations;
    int total_writes;

    int unregister_current, unregister_next;
    uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX];

    GHashTable *blockmap;
} RDMAContext;

/*
 * Interface to the rest of the migration call stack.
 */
typedef struct QEMUFileRDMA {
    RDMAContext *rdma;
    size_t len;
    void *file;
} QEMUFileRDMA;

/*
 * Main structure for IB Send/Recv control messages.
 * This gets prepended at the beginning of every Send/Recv.
 */
typedef struct QEMU_PACKED {
    uint32_t len;     /* Total length of data portion */
    uint32_t type;    /* which control command to perform */
    uint32_t repeat;  /* number of commands in data portion of same type */
    uint32_t padding;
} RDMAControlHeader;

static void control_to_network(RDMAControlHeader *control)
{
    control->type = htonl(control->type);
    control->len = htonl(control->len);
    control->repeat = htonl(control->repeat);
}

static void network_to_control(RDMAControlHeader *control)
{
    control->type = ntohl(control->type);
    control->len = ntohl(control->len);
    control->repeat = ntohl(control->repeat);
}

/*
 * Register a single Chunk.
 * Information sent by the source VM to inform the dest
 * to register a single chunk of memory before we can perform
 * the actual RDMA operation.
 */
typedef struct QEMU_PACKED {
    union QEMU_PACKED {
        uint64_t current_addr;  /* offset into the ramblock of the chunk */
        uint64_t chunk;         /* chunk to lookup if unregistering */
    } key;
    uint32_t current_index;     /* which ramblock the chunk belongs to */
    uint32_t padding;
    uint64_t chunks;            /* how many sequential chunks to register */
} RDMARegister;

static void register_to_network(RDMARegister *reg)
{
    reg->key.current_addr = htonll(reg->key.current_addr);
    reg->current_index = htonl(reg->current_index);
    reg->chunks = htonll(reg->chunks);
}

static void network_to_register(RDMARegister *reg)
{
    reg->key.current_addr = ntohll(reg->key.current_addr);
    reg->current_index = ntohl(reg->current_index);
    reg->chunks = ntohll(reg->chunks);
}

typedef struct QEMU_PACKED {
    uint32_t value;     /* if zero, we will madvise() */
    uint32_t block_idx; /* which ram block index */
    uint64_t offset;    /* where in the remote ramblock this chunk */
    uint64_t length;    /* length of the chunk */
} RDMACompress;

static void compress_to_network(RDMACompress *comp)
{
    comp->value = htonl(comp->value);
    comp->block_idx = htonl(comp->block_idx);
    comp->offset = htonll(comp->offset);
    comp->length = htonll(comp->length);
}

static void network_to_compress(RDMACompress *comp)
{
    comp->value = ntohl(comp->value);
    comp->block_idx = ntohl(comp->block_idx);
    comp->offset = ntohll(comp->offset);
    comp->length = ntohll(comp->length);
}

/*
 * The result of the dest's memory registration produces an "rkey"
 * which the source VM must reference in order to perform
 * the RDMA operation.
 */
typedef struct QEMU_PACKED {
    uint32_t rkey;
    uint32_t padding;
    uint64_t host_addr;
} RDMARegisterResult;

static void result_to_network(RDMARegisterResult *result)
{
    result->rkey = htonl(result->rkey);
    result->host_addr = htonll(result->host_addr);
}

static void network_to_result(RDMARegisterResult *result)
{
    result->rkey = ntohl(result->rkey);
    result->host_addr = ntohll(result->host_addr);
}

const char *print_wrid(int wrid);
static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
                                   uint8_t *data, RDMAControlHeader *resp,
                                   int *resp_idx,
                                   int (*callback)(RDMAContext *rdma));

static inline uint64_t ram_chunk_index(const uint8_t *start,
                                       const uint8_t *host)
{
    return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT;
}

static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block,
                                       uint64_t i)
{
    return (uint8_t *) (((uintptr_t) rdma_ram_block->local_host_addr)
                                    + (i << RDMA_REG_CHUNK_SHIFT));
}

static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block,
                                     uint64_t i)
{
    uint8_t *result = ram_chunk_start(rdma_ram_block, i) +
                      (1UL << RDMA_REG_CHUNK_SHIFT);

    if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) {
        result = rdma_ram_block->local_host_addr + rdma_ram_block->length;
    }

    return result;
}
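
/*
 * Example (illustrative): with RDMA_REG_CHUNK_SHIFT = 20 each chunk covers
 * 1 MiB, so for an 8 MiB RAMBlock at 'addr', ram_chunk_index(addr,
 * addr + length) returns 8 and __qemu_rdma_add_block() below sets
 * nb_chunks = 9.
 */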

static int __qemu_rdma_add_block(RDMAContext *rdma, void *host_addr,
                         ram_addr_t block_offset, uint64_t length)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
        (void *) block_offset);
    RDMALocalBlock *old = local->block;

    assert(block == NULL);

    local->block = g_malloc0(sizeof(RDMALocalBlock) * (local->nb_blocks + 1));

    if (local->nb_blocks) {
        int x;

        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
            g_hash_table_insert(rdma->blockmap, (void *)old[x].offset,
                                                &local->block[x]);
        }
        memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks);
        g_free(old);
    }

    block = &local->block[local->nb_blocks];

    block->local_host_addr = host_addr;
    block->offset = block_offset;
    block->length = length;
    block->index = local->nb_blocks;
    block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL;
    block->transit_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->transit_bitmap, 0, block->nb_chunks);
    block->unregister_bitmap = bitmap_new(block->nb_chunks);
    bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks);
    block->remote_keys = g_malloc0(block->nb_chunks * sizeof(uint32_t));

    block->is_ram_block = local->init ? false : true;

    g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);

    DDPRINTF("Added Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
           " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
           local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
           block->length, (uint64_t) (block->local_host_addr + block->length),
           BITS_TO_LONGS(block->nb_chunks) *
               sizeof(unsigned long) * 8, block->nb_chunks);

    local->nb_blocks++;

    return 0;
}

/*
 * Memory regions need to be registered with the device and queue pairs set
 * up in advance before the migration starts. This tells us where the RAM
 * blocks are so that we can register them individually.
 */
static void qemu_rdma_init_one_block(void *host_addr,
    ram_addr_t block_offset, ram_addr_t length, void *opaque)
{
    __qemu_rdma_add_block(opaque, host_addr, block_offset, length);
}

/*
 * Identify the RAMBlocks and their quantity. They will be used to identify
 * chunk boundaries inside each RAMBlock and also be referenced
 * during dynamic page registration.
 */
static int qemu_rdma_init_ram_blocks(RDMAContext *rdma)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    assert(rdma->blockmap == NULL);
    rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal);
    memset(local, 0, sizeof *local);
    qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma);
    DPRINTF("Allocated %d local ram block structures\n", local->nb_blocks);
    rdma->block = (RDMARemoteBlock *) g_malloc0(sizeof(RDMARemoteBlock) *
                        rdma->local_ram_blocks.nb_blocks);
    local->init = true;
    return 0;
}

static int __qemu_rdma_delete_block(RDMAContext *rdma, ram_addr_t block_offset)
{
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
        (void *) block_offset);
    RDMALocalBlock *old = local->block;
    int x;

    assert(block);

    if (block->pmr) {
        int j;

        for (j = 0; j < block->nb_chunks; j++) {
            if (!block->pmr[j]) {
                continue;
            }
            ibv_dereg_mr(block->pmr[j]);
            rdma->total_registrations--;
        }
        g_free(block->pmr);
        block->pmr = NULL;
    }

    if (block->mr) {
        ibv_dereg_mr(block->mr);
        rdma->total_registrations--;
        block->mr = NULL;
    }

    g_free(block->transit_bitmap);
    block->transit_bitmap = NULL;

    g_free(block->unregister_bitmap);
    block->unregister_bitmap = NULL;

    g_free(block->remote_keys);
    block->remote_keys = NULL;

    for (x = 0; x < local->nb_blocks; x++) {
        g_hash_table_remove(rdma->blockmap, (void *)old[x].offset);
    }

    if (local->nb_blocks > 1) {

        local->block = g_malloc0(sizeof(RDMALocalBlock) *
                                    (local->nb_blocks - 1));

        if (block->index) {
            memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index);
        }

        if (block->index < (local->nb_blocks - 1)) {
            memcpy(local->block + block->index, old + (block->index + 1),
                sizeof(RDMALocalBlock) *
                    (local->nb_blocks - (block->index + 1)));
        }
    } else {
        assert(block == local->block);
        local->block = NULL;
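        /*
         * Annotation (not in the original source): from here on local->block
         * stays NULL. Later code that indexes local_ram_blocks.block without
         * checking nb_blocks dereferences a null pointer; the report in the
         * header (line 2236, field 'offset' loaded from field 'block') is of
         * this kind.
         */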
    }

    DDPRINTF("Deleted Block: %d, addr: %" PRIu64 ", offset: %" PRIu64
           " length: %" PRIu64 " end: %" PRIu64 " bits %" PRIu64 " chunks %d\n",
           local->nb_blocks, (uint64_t) block->local_host_addr, block->offset,
           block->length, (uint64_t) (block->local_host_addr + block->length),
           BITS_TO_LONGS(block->nb_chunks) *
               sizeof(unsigned long) * 8, block->nb_chunks);

    g_free(old);

    local->nb_blocks--;

    if (local->nb_blocks) {
        for (x = 0; x < local->nb_blocks; x++) {
            g_hash_table_insert(rdma->blockmap, (void *)local->block[x].offset,
                                                &local->block[x]);
        }
    }

    return 0;
}

/*
 * Put in the log file which RDMA device was opened and the details
 * associated with that device.
 */
static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
{
    struct ibv_port_attr port;

    if (ibv_query_port(verbs, 1, &port)) {
        fprintf(stderr, "FAILED TO QUERY PORT INFORMATION!\n");
        return;
    }

    printf("%s RDMA Device opened: kernel name %s "
           "uverbs device name %s, "
           "infiniband_verbs class device path %s, "
           "infiniband class device path %s, "
           "transport: (%d) %s\n",
                who,
                verbs->device->name,
                verbs->device->dev_name,
                verbs->device->dev_path,
                verbs->device->ibdev_path,
                port.link_layer,
                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
                    ? "Ethernet" : "Unknown"));
}

/*
 * Put in the log file the RDMA gid addressing information,
 * useful for folks who have trouble understanding the
 * RDMA device hierarchy in the kernel.
 */
static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
{
    char sgid[33];
    char dgid[33];
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
    DPRINTF("%s Source GID: %s, Dest GID: %s\n", who, sgid, dgid);
}

/*
 * As of now, IPv6 over RoCE / iWARP is not supported by linux.
 * We will try the next addrinfo struct, and fail if there are
 * no other valid addresses to bind against.
 *
 * If user is listening on '[::]', then we will not have opened a device
 * yet and have no way of verifying if the device is RoCE or not.
 *
 * In this case, the source VM will throw an error for ALL types of
 * connections (both IPv4 and IPv6) if the destination machine does not have
 * a regular infiniband network available for use.
 *
 * The only way to guarantee that an error is thrown for broken kernels is
 * for the management software to choose a *specific* interface at bind time
 * and validate what type of hardware it is.
 *
 * Unfortunately, this puts the user in a fix:
 *
 *  If the source VM connects with an IPv4 address without knowing that the
 *  destination has bound to '[::]' the migration will unconditionally fail
 *  unless the management software is explicitly listening on the IPv4
 *  address while using a RoCE-based device.
 *
 *  If the source VM connects with an IPv6 address, then we're OK because we
 *  can throw an error on the source (and similarly on the destination).
 *
 *  But in mixed environments, this will be broken for a while until it is
 *  fixed inside linux.
 *
 * We do provide a *tiny* bit of help in this function: We can list all of the
 * devices in the system and check to see if all the devices are RoCE or
 * Infiniband.
 *
 * If we detect that we have a *pure* RoCE environment, then we can safely
 * throw an error even if the management software has specified '[::]' as the
 * bind address.
 *
 * However, if there are multiple heterogeneous devices, then we cannot make
 * this assumption and the user just has to be sure they know what they are
 * doing.
 *
 * Patches are being reviewed on linux-rdma.
 */
static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
{
    struct ibv_port_attr port_attr;

    /* This bug only exists in linux, to our knowledge. */
#ifdef CONFIG_LINUX

    /*
     * Verbs are only NULL if management has bound to '[::]'.
     *
     * Let's iterate through all the devices and see if there are any pure IB
     * devices (non-ethernet).
     *
     * If not, then we can safely proceed with the migration.
     * Otherwise, there are no guarantees until the bug is fixed in linux.
     */
    if (!verbs) {
        int num_devices, x;
        struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
        bool roce_found = false;
        bool ib_found = false;

        for (x = 0; x < num_devices; x++) {
            verbs = ibv_open_device(dev_list[x]);

            if (ibv_query_port(verbs, 1, &port_attr)) {
                ibv_close_device(verbs);
                ERROR(errp, "Could not query initial IB port");
                return -EINVAL;
            }

            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
                ib_found = true;
            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
                roce_found = true;
            }

            ibv_close_device(verbs);

        }

        if (roce_found) {
            if (ib_found) {
                fprintf(stderr, "WARN: migrations may fail:"
                                " IPv6 over RoCE / iWARP in linux"
                                " is broken. But since you appear to have a"
                                " mixed RoCE / IB environment, be sure to only"
                                " migrate over the IB fabric until the kernel "
                                " fixes the bug.\n");
            } else {
                ERROR(errp, "You only have RoCE / iWARP devices in your systems"
                            " and your management software has specified '[::]'"
                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
                return -ENONET;
            }
        }

        return 0;
    }

    /*
     * If we have a verbs context, that means that something other than '[::]'
     * was used by the management software for binding. In which case we can
     * actually warn the user about a potentially broken kernel.
     */

    /* IB ports start with 1, not 0 */
    if (ibv_query_port(verbs, 1, &port_attr)) {
        ERROR(errp, "Could not query initial IB port");
        return -EINVAL;
    }

    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
                    "(but patches on linux-rdma in progress)");
        return -ENONET;
    }

#endif

    return 0;
}

/*
 * Figure out which RDMA device corresponds to the requested IP hostname
 * Also create the initial connection manager identifiers for opening
 * the connection.
 */
static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp)
{
    int ret;
    struct rdma_addrinfo *res;
    char port_str[16];
    struct rdma_cm_event *cm_event;
    char ip[40] = "unknown";
    struct rdma_addrinfo *e;

    if (rdma->host == NULL || !strcmp(rdma->host, "")) {
        ERROR(errp, "RDMA hostname has not been set");
        return -EINVAL;
    }

    /* create CM channel */
    rdma->channel = rdma_create_event_channel();
    if (!rdma->channel) {
        ERROR(errp, "could not create CM channel");
        return -EINVAL;
    }

    /* create CM id */
    ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP);
    if (ret) {
        ERROR(errp, "could not create channel id");
        goto err_resolve_create_id;
    }

    snprintf(port_str, 16, "%d", rdma->port);
    port_str[15] = '\0';

    ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
    if (ret < 0) {
        ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
        goto err_resolve_get_addr;
    }

    for (e = res; e != NULL; e = e->ai_next) {
        inet_ntop(e->ai_family,
            &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
        DPRINTF("Trying %s => %s\n", rdma->host, ip);

        ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr,
                RDMA_RESOLVE_TIMEOUT_MS);
        if (!ret) {
            if (e->ai_family == AF_INET6) {
                ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs);
                if (ret) {
                    continue;
                }
            }
            goto route;
        }
    }

    ERROR(errp, "could not resolve address %s", rdma->host);
    goto err_resolve_get_addr;

route:
    qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id);

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_addr_resolved");
        goto err_resolve_get_addr;
    }

    if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) {
        ERROR(errp, "result not equal to event_addr_resolved %s",
                rdma_event_str(cm_event->event));
        perror("rdma_resolve_addr");
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);

    /* resolve route */
    ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS);
    if (ret) {
        ERROR(errp, "could not resolve rdma route");
        goto err_resolve_get_addr;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        ERROR(errp, "could not perform event_route_resolved");
        goto err_resolve_get_addr;
    }
    if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) {
        ERROR(errp, "result not equal to event_route_resolved: %s",
                rdma_event_str(cm_event->event));
        rdma_ack_cm_event(cm_event);
        ret = -EINVAL;
        goto err_resolve_get_addr;
    }
    rdma_ack_cm_event(cm_event);
    rdma->verbs = rdma->cm_id->verbs;
    qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs);
    qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id);
    return 0;

err_resolve_get_addr:
    rdma_destroy_id(rdma->cm_id);
    rdma->cm_id = NULL;
err_resolve_create_id:
    rdma_destroy_event_channel(rdma->channel);
    rdma->channel = NULL;
    return ret;
}

/*
 * Create protection domain and completion queues
 */
static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma)
{
    /* allocate pd */
    rdma->pd = ibv_alloc_pd(rdma->verbs);
    if (!rdma->pd) {
        fprintf(stderr, "failed to allocate protection domain\n");
        return -1;
    }

    /* create completion channel */
    rdma->comp_channel = ibv_create_comp_channel(rdma->verbs);
    if (!rdma->comp_channel) {
        fprintf(stderr, "failed to allocate completion channel\n");
        goto err_alloc_pd_cq;
    }

    /*
     * Completion queue can be filled by both read and write work requests,
     * so must reflect the sum of both possible queue sizes.
     */
    rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3),
            NULL, rdma->comp_channel, 0);
    if (!rdma->cq) {
        fprintf(stderr, "failed to allocate completion queue\n");
        goto err_alloc_pd_cq;
    }

    return 0;

err_alloc_pd_cq:
    if (rdma->pd) {
        ibv_dealloc_pd(rdma->pd);
    }
    if (rdma->comp_channel) {
        ibv_destroy_comp_channel(rdma->comp_channel);
    }
    rdma->pd = NULL;
    rdma->comp_channel = NULL;
    return -1;
}

/*
 * Create queue pairs.
 */
static int qemu_rdma_alloc_qp(RDMAContext *rdma)
{
    struct ibv_qp_init_attr attr = { 0 };
    int ret;

    attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX;
    attr.cap.max_recv_wr = 3;
    attr.cap.max_send_sge = 1;
    attr.cap.max_recv_sge = 1;
    attr.send_cq = rdma->cq;
    attr.recv_cq = rdma->cq;
    attr.qp_type = IBV_QPT_RC;

    ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr);
    if (ret) {
        return -1;
    }

    rdma->qp = rdma->cm_id->qp;
    return 0;
}

static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma)
{
    int i;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;

    for (i = 0; i < local->nb_blocks; i++) {
        local->block[i].mr =
            ibv_reg_mr(rdma->pd,
                    local->block[i].local_host_addr,
                    local->block[i].length,
                    IBV_ACCESS_LOCAL_WRITE |
                    IBV_ACCESS_REMOTE_WRITE
                    );
        if (!local->block[i].mr) {
            perror("Failed to register local dest ram block!");
            break;
        }
        rdma->total_registrations++;
    }

    if (i >= local->nb_blocks) {
        return 0;
    }

    for (i--; i >= 0; i--) {
        ibv_dereg_mr(local->block[i].mr);
        rdma->total_registrations--;
    }

    return -1;
}

/*
 * Find the ram block that corresponds to the page requested to be
 * transmitted by QEMU.
 *
 * Once the block is found, also identify which 'chunk' within that
 * block that the page belongs to.
 *
 * This search cannot fail or the migration will fail.
 */
static int qemu_rdma_search_ram_block(RDMAContext *rdma,
                                      uint64_t block_offset,
                                      uint64_t offset,
                                      uint64_t length,
                                      uint64_t *block_index,
                                      uint64_t *chunk_index)
{
    uint64_t current_addr = block_offset + offset;
    RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap,
        (void *) block_offset);
    assert(block);
    assert(current_addr >= block->offset);
    assert((current_addr + length) <= (block->offset + block->length));

    *block_index = block->index;
    *chunk_index = ram_chunk_index(block->local_host_addr,
                block->local_host_addr + (current_addr - block->offset));

    return 0;
}

/*
 * Register a chunk with IB. If the chunk was already registered
 * previously, then skip.
 *
 * Also return the keys associated with the registration needed
 * to perform the actual RDMA operation.
 */
static int qemu_rdma_register_and_get_keys(RDMAContext *rdma,
        RDMALocalBlock *block, uint8_t *host_addr,
        uint32_t *lkey, uint32_t *rkey, int chunk,
        uint8_t *chunk_start, uint8_t *chunk_end)
{
    if (block->mr) {
        if (lkey) {
            *lkey = block->mr->lkey;
        }
        if (rkey) {
            *rkey = block->mr->rkey;
        }
        return 0;
    }

    /* allocate memory to store chunk MRs */
    if (!block->pmr) {
        block->pmr = g_malloc0(block->nb_chunks * sizeof(struct ibv_mr *));
        if (!block->pmr) {
            return -1;
        }
    }

    /*
     * If 'rkey', then we're the destination, so grant access to the source.
     *
     * If 'lkey', then we're the source VM, so grant access only to ourselves.
     */
    if (!block->pmr[chunk]) {
        uint64_t len = chunk_end - chunk_start;

        DDPRINTF("Registering %" PRIu64 " bytes @ %p\n",
                 len, chunk_start);

        block->pmr[chunk] = ibv_reg_mr(rdma->pd,
                chunk_start, len,
                (rkey ? (IBV_ACCESS_LOCAL_WRITE |
                        IBV_ACCESS_REMOTE_WRITE) : 0));

        if (!block->pmr[chunk]) {
            perror("Failed to register chunk!");
            fprintf(stderr, "Chunk details: block: %d chunk index %d"
                            " start %" PRIu64 " end %" PRIu64 " host %" PRIu64
                            " local %" PRIu64 " registrations: %d\n",
                            block->index, chunk, (uint64_t) chunk_start,
                            (uint64_t) chunk_end, (uint64_t) host_addr,
                            (uint64_t) block->local_host_addr,
                            rdma->total_registrations);
            return -1;
        }
        rdma->total_registrations++;
    }

    if (lkey) {
        *lkey = block->pmr[chunk]->lkey;
    }
    if (rkey) {
        *rkey = block->pmr[chunk]->rkey;
    }
    return 0;
}

/*
 * Register (at connection time) the memory used for control
 * channel messages.
 */
static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
{
    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
    if (rdma->wr_data[idx].control_mr) {
        rdma->total_registrations++;
        return 0;
    }
    fprintf(stderr, "qemu_rdma_reg_control failed!\n");
    return -1;
}

const char *print_wrid(int wrid)
{
    if (wrid >= RDMA_WRID_RECV_CONTROL) {
        return wrid_desc[RDMA_WRID_RECV_CONTROL];
    }
    return wrid_desc[wrid];
}

/*
 * RDMA requires memory registration (mlock/pinning), but this is not good for
 * overcommitment.
 *
 * In preparation for the future where LRU information or workload-specific
 * writable working set memory access behavior is available to QEMU
 * it would be nice to have in place the ability to UN-register/UN-pin
 * particular memory regions from the RDMA hardware when it is determined that
 * those regions of memory will likely not be accessed again in the near
 * future.
 *
 * While we do not yet have such information right now, the following
 * compile-time option allows us to perform a non-optimized version of this
 * behavior.
 *
 * By uncommenting this option, you will cause *all* RDMA transfers to be
 * unregistered immediately after the transfer completes on both sides of the
 * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
 *
 * This will have a terrible impact on migration performance, so until future
 * workload information or LRU information is available, do not attempt to use
 * this feature except for basic testing.
 */
//#define RDMA_UNREGISTRATION_EXAMPLE
1241 | ||||
1242 | /*
1243 |  * Perform a non-optimized memory unregistration after every transfer
1244 |  * for demonstration purposes, only if pin-all is not requested.
1245 |  *
1246 |  * Potential optimizations:
1247 |  * 1. Start a new thread to run this function continuously
1248 |  *        - for bit clearing
1249 |  *        - and for receipt of unregister messages
1250 |  * 2. Use an LRU.
1251 |  * 3. Use workload hints.
1252 |  */
1253 | static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
1254 | {
1255 |     while (rdma->unregistrations[rdma->unregister_current]) {
1256 |         int ret;
1257 |         uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
1258 |         uint64_t chunk =
1259 |             (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1260 |         uint64_t index =
1261 |             (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1262 |         RDMALocalBlock *block =
1263 |             &(rdma->local_ram_blocks.block[index]);
1264 |         RDMARegister reg = { .current_index = index };
1265 |         RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
1266 |                                  };
1267 |         RDMAControlHeader head = { .len = sizeof(RDMARegister),
1268 |                                    .type = RDMA_CONTROL_UNREGISTER_REQUEST,
1269 |                                    .repeat = 1,
1270 |                                  };
1271 | 
1272 |         DDPRINTF("Processing unregister for chunk: %" PRIu64
1273 |                  " at position %d\n", chunk, rdma->unregister_current);
1274 | 
1275 |         rdma->unregistrations[rdma->unregister_current] = 0;
1276 |         rdma->unregister_current++;
1277 | 
1278 |         if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
1279 |             rdma->unregister_current = 0;
1280 |         }
1281 | 
1282 | 
1283 |         /*
1284 |          * Unregistration is speculative (because migration is single-threaded
1285 |          * and we cannot break the protocol's infiniband message ordering).
1286 |          * Thus, if the memory is currently being used for transmission,
1287 |          * then abort the attempt to unregister and try again
1288 |          * later the next time a completion is received for this memory.
1289 |          */
1290 |         clear_bit(chunk, block->unregister_bitmap);
1291 | 
1292 |         if (test_bit(chunk, block->transit_bitmap)) {
1293 |             DDPRINTF("Cannot unregister inflight chunk: %" PRIu64 "\n", chunk);
1294 |             continue;
1295 |         }
1296 | 
1297 |         DDPRINTF("Sending unregister for chunk: %" PRIu64 "\n", chunk);
1298 | 
1299 |         ret = ibv_dereg_mr(block->pmr[chunk]);
1300 |         block->pmr[chunk] = NULL;
1301 |         block->remote_keys[chunk] = 0;
1302 | 
1303 |         if (ret != 0) {
1304 |             perror("unregistration chunk failed");
1305 |             return -ret;
1306 |         }
1307 |         rdma->total_registrations--;
1308 | 
1309 |         reg.key.chunk = chunk;
1310 |         register_to_network(&reg);
1311 |         ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1312 |                                       &resp, NULL, NULL);
1313 |         if (ret < 0) {
1314 |             return ret;
1315 |         }
1316 | 
1317 |         DDPRINTF("Unregister for chunk: %" PRIu64 " complete.\n", chunk);
1318 |     }
1319 | 
1320 |     return 0;
1321 | }
1322 | 
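     | /*
     |  * Pack a work request ID for an RDMA write. Given the masks above, the
     |  * layout is: bits 0-15 carry the wrid type, bits 16-29 the RAM block
     |  * index, and bits 30 and up the chunk number, so a single 64-bit wr_id
     |  * identifies exactly which chunk a completion refers to.
     |  */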
1323 | static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
1324 |                                     uint64_t chunk)
1325 | {
1326 |     uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
1327 | 
1328 |     result |= (index << RDMA_WRID_BLOCK_SHIFT);
1329 |     result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
1330 | 
1331 |     return result;
1332 | }
1333 | 
1334 | /*
1335 |  * Set bit for unregistration in the next iteration.
1336 |  * We cannot transmit right here, but will unpin later.
1337 |  */
1338 | static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
1339 |                                         uint64_t chunk, uint64_t wr_id)
1340 | {
1341 |     if (rdma->unregistrations[rdma->unregister_next] != 0) {
1342 |         fprintf(stderr, "rdma migration: queue is full!\n");
1343 |     } else {
1344 |         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1345 | 
1346 |         if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
1347 |             DDPRINTF("Appending unregister chunk %" PRIu64
1348 |                      " at position %d\n", chunk, rdma->unregister_next);
1349 | 
1350 |             rdma->unregistrations[rdma->unregister_next++] =
1351 |                 qemu_rdma_make_wrid(wr_id, index, chunk);
1352 | 
1353 |             if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
1354 |                 rdma->unregister_next = 0;
1355 |             }
1356 |         } else {
1357 |             DDPRINTF("Unregister chunk %" PRIu64 " already in queue.\n",
1358 |                      chunk);
1359 |         }
1360 |     }
1361 | }
1362 | 
1363 | /*
1364 |  * Consult the connection manager to see if a work request
1365 |  * (of any kind) has completed.
1366 |  * Return the work request ID that completed.
1367 |  */
1368 | static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out,
1369 |                                uint32_t *byte_len)
1370 | {
1371 |     int ret;
1372 |     struct ibv_wc wc;
1373 |     uint64_t wr_id;
1374 | 
1375 |     ret = ibv_poll_cq(rdma->cq, 1, &wc);
1376 | 
1377 |     if (!ret) {
1378 |         *wr_id_out = RDMA_WRID_NONE;
1379 |         return 0;
1380 |     }
1381 | 
1382 |     if (ret < 0) {
1383 |         fprintf(stderr, "ibv_poll_cq return %d!\n", ret);
1384 |         return ret;
1385 |     }
1386 | 
1387 |     wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK;
1388 | 
1389 |     if (wc.status != IBV_WC_SUCCESS) {
1390 |         fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n",
1391 |                 wc.status, ibv_wc_status_str(wc.status));
1392 |         fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]);
1393 | 
1394 |         return -1;
1395 |     }
1396 | 
1397 |     if (rdma->control_ready_expected &&
1398 |         (wr_id >= RDMA_WRID_RECV_CONTROL)) {
1399 |         DDDPRINTF("completion %s #%" PRId64 " received (%" PRId64 ")"
1400 |                   " left %d\n", wrid_desc[RDMA_WRID_RECV_CONTROL],
1401 |                   wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent);
1402 |         rdma->control_ready_expected = 0;
1403 |     }
1404 | 
1405 |     if (wr_id == RDMA_WRID_RDMA_WRITE) {
1406 |         uint64_t chunk =
1407 |             (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
1408 |         uint64_t index =
1409 |             (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
1410 |         RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
1411 | 
1412 |         DDDPRINTF("completions %s (%" PRId64 ") left %d, "
1413 |                   "block %" PRIu64 ", chunk: %" PRIu64 " %p %p\n",
1414 |                   print_wrid(wr_id), wr_id, rdma->nb_sent, index, chunk,
1415 |                   block->local_host_addr, (void *)block->remote_host_addr);
1416 | 
1417 |         clear_bit(chunk, block->transit_bitmap);
1418 | 
1419 |         if (rdma->nb_sent > 0) {
1420 |             rdma->nb_sent--;
1421 |         }
1422 | 
1423 |         if (!rdma->pin_all) {
1424 |             /*
1425 |              * FYI: If one wanted to signal a specific chunk to be unregistered
1426 |              * using LRU or workload-specific information, this is the function
1427 |              * you would call to do so. That chunk would then get asynchronously
1428 |              * unregistered later.
1429 |              */
1430 | #ifdef RDMA_UNREGISTRATION_EXAMPLE
1431 |             qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id);
1432 | #endif
1433 |         }
1434 |     } else {
1435 |         DDDPRINTF("other completion %s (%" PRId64 ") received left %d\n",
1436 |                   print_wrid(wr_id), wr_id, rdma->nb_sent);
1437 |     }
1438 | 
1439 |     *wr_id_out = wc.wr_id;
1440 |     if (byte_len) {
1441 |         *byte_len = wc.byte_len;
1442 |     }
1443 | 
1444 |     return 0;
1445 | }
1446 | 
1447 | /*
1448 |  * Block until the next work request has completed.
1449 |  *
1450 |  * First poll to see if a work request has already completed,
1451 |  * otherwise block.
1452 |  *
1453 |  * If we encounter completed work requests for IDs other than
1454 |  * the one we're interested in, then that's generally an error.
1455 |  *
1456 |  * The only exception is actual RDMA Write completions. These
1457 |  * completions only need to be recorded, but do not actually
1458 |  * need further processing.
1459 |  */
1460 | static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested,
1461 |                                     uint32_t *byte_len)
1462 | {
1463 |     int num_cq_events = 0, ret = 0;
1464 |     struct ibv_cq *cq;
1465 |     void *cq_ctx;
1466 |     uint64_t wr_id = RDMA_WRID_NONE, wr_id_in;
1467 | 
1468 |     if (ibv_req_notify_cq(rdma->cq, 0)) {
1469 |         return -1;
1470 |     }
1471 |     /* poll cq first */
1472 |     while (wr_id != wrid_requested) {
1473 |         ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1474 |         if (ret < 0) {
1475 |             return ret;
1476 |         }
1477 | 
1478 |         wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1479 | 
1480 |         if (wr_id == RDMA_WRID_NONE) {
1481 |             break;
1482 |         }
1483 |         if (wr_id != wrid_requested) {
1484 |             DDDPRINTF("A Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
1485 |                       print_wrid(wrid_requested),
1486 |                       wrid_requested, print_wrid(wr_id), wr_id);
1487 |         }
1488 |     }
1489 | 
1490 |     if (wr_id == wrid_requested) {
1491 |         return 0;
1492 |     }
1493 | 
1494 |     while (1) {
1495 |         /*
1496 |          * Coroutine doesn't start until process_incoming_migration()
1497 |          * so don't yield unless we know we're running inside of a coroutine.
1498 |          */
1499 |         if (rdma->migration_started_on_destination) {
1500 |             yield_until_fd_readable(rdma->comp_channel->fd);
1501 |         }
1502 | 
1503 |         if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) {
1504 |             perror("ibv_get_cq_event");
1505 |             goto err_block_for_wrid;
1506 |         }
1507 | 
1508 |         num_cq_events++;
1509 | 
1510 |         if (ibv_req_notify_cq(cq, 0)) {
1511 |             goto err_block_for_wrid;
1512 |         }
1513 | 
1514 |         while (wr_id != wrid_requested) {
1515 |             ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len);
1516 |             if (ret < 0) {
1517 |                 goto err_block_for_wrid;
1518 |             }
1519 | 
1520 |             wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;
1521 | 
1522 |             if (wr_id == RDMA_WRID_NONE) {
1523 |                 break;
1524 |             }
1525 |             if (wr_id != wrid_requested) {
1526 |                 DDDPRINTF("B Wanted wrid %s (%d) but got %s (%" PRIu64 ")\n",
1527 |                           print_wrid(wrid_requested), wrid_requested,
1528 |                           print_wrid(wr_id), wr_id);
1529 |             }
1530 |         }
1531 | 
1532 |         if (wr_id == wrid_requested) {
1533 |             goto success_block_for_wrid;
1534 |         }
1535 |     }
1536 | 
1537 | success_block_for_wrid:
1538 |     if (num_cq_events) {
1539 |         ibv_ack_cq_events(cq, num_cq_events);
1540 |     }
1541 |     return 0;
1542 | 
1543 | err_block_for_wrid:
1544 |     if (num_cq_events) {
1545 |         ibv_ack_cq_events(cq, num_cq_events);
1546 |     }
1547 |     return ret;
1548 | }
1549 | 
1550 | /*
1551 |  * Post a SEND message work request for the control channel
1552 |  * containing some data and block until the post completes.
1553 |  */
1554 | static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf,
1555 |                                        RDMAControlHeader *head)
1556 | {
1557 |     int ret = 0;
1558 |     RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL];
1559 |     struct ibv_send_wr *bad_wr;
1560 |     struct ibv_sge sge = {
1561 |                            .addr = (uint64_t)(wr->control),
1562 |                            .length = head->len + sizeof(RDMAControlHeader),
1563 |                            .lkey = wr->control_mr->lkey,
1564 |                          };
1565 |     struct ibv_send_wr send_wr = {
1566 |                                    .wr_id = RDMA_WRID_SEND_CONTROL,
1567 |                                    .opcode = IBV_WR_SEND,
1568 |                                    .send_flags = IBV_SEND_SIGNALED,
1569 |                                    .sg_list = &sge,
1570 |                                    .num_sge = 1,
1571 |                                 };
1572 | 
1573 |     DDDPRINTF("CONTROL: sending %s..\n", control_desc[head->type]);
1574 | 
1575 |     /*
1576 |      * We don't actually need to do a memcpy() in here if we used
1577 |      * the "sge" properly, but since we're only sending control messages
1578 |      * (not RAM in a performance-critical path), then it's OK for now.
1579 |      *
1580 |      * The copy makes the RDMAControlHeader simpler to manipulate
1581 |      * for the time being.
1582 |      */
1583 |     assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head));
1584 |     memcpy(wr->control, head, sizeof(RDMAControlHeader));
1585 |     control_to_network((void *) wr->control);
1586 | 
1587 |     if (buf) {
1588 |         memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len);
1589 |     }
1590 | 
1591 | 
1592 |     if (ibv_post_send(rdma->qp, &send_wr, &bad_wr)) {
1593 |         return -1;
1594 |     }
1595 | 
1596 |     if (ret < 0) {
1597 |         fprintf(stderr, "Failed to use post IB SEND for control!\n");
1598 |         return ret;
1599 |     }
1600 | 
1601 |     ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL);
1602 |     if (ret < 0) {
1603 |         fprintf(stderr, "rdma migration: send polling control error!\n");
1604 |     }
1605 | 
1606 |     return ret;
1607 | }
1608 | 
1609 | /*
1610 |  * Post a RECV work request in anticipation of some future receipt
1611 |  * of data on the control channel.
1612 |  */
1613 | static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx)
1614 | {
1615 |     struct ibv_recv_wr *bad_wr;
1616 |     struct ibv_sge sge = {
1617 |                            .addr = (uint64_t)(rdma->wr_data[idx].control),
1618 |                            .length = RDMA_CONTROL_MAX_BUFFER,
1619 |                            .lkey = rdma->wr_data[idx].control_mr->lkey,
1620 |                          };
1621 | 
1622 |     struct ibv_recv_wr recv_wr = {
1623 |                                    .wr_id = RDMA_WRID_RECV_CONTROL + idx,
1624 |                                    .sg_list = &sge,
1625 |                                    .num_sge = 1,
1626 |                                  };
1627 | 
1628 | 
1629 |     if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) {
1630 |         return -1;
1631 |     }
1632 | 
1633 |     return 0;
1634 | }
1635 | 
1636 | /*
1637 |  * Block and wait for a RECV control channel message to arrive.
1638 |  */
1639 | static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
1640 |                 RDMAControlHeader *head, int expecting, int idx)
1641 | {
1642 |     uint32_t byte_len;
1643 |     int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
1644 |                                        &byte_len);
1645 | 
1646 |     if (ret < 0) {
1647 |         fprintf(stderr, "rdma migration: recv polling control error!\n");
1648 |         return ret;
1649 |     }
1650 | 
1651 |     network_to_control((void *) rdma->wr_data[idx].control);
1652 |     memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
1653 | 
1654 |     DDDPRINTF("CONTROL: %s receiving...\n", control_desc[expecting]);
1655 | 
1656 |     if (expecting == RDMA_CONTROL_NONE) {
1657 |         DDDPRINTF("Surprise: got %s (%d)\n",
1658 |                   control_desc[head->type], head->type);
1659 |     } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
1660 |         fprintf(stderr, "Was expecting a %s (%d) control message"
1661 |                 ", but got: %s (%d), length: %d\n",
1662 |                 control_desc[expecting], expecting,
1663 |                 control_desc[head->type], head->type, head->len);
1664 |         return -EIO;
1665 |     }
1666 |     if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
1667 |         fprintf(stderr, "too long length: %d\n", head->len);
1668 |         return -EINVAL;
1669 |     }
1670 |     if (sizeof(*head) + head->len != byte_len) {
1671 |         fprintf(stderr, "Malformed length: %d byte_len %d\n",
1672 |                 head->len, byte_len);
1673 |         return -EINVAL;
1674 |     }
1675 | 
1676 |     return 0;
1677 | }
1678 | 
1679 | /*
1680 |  * When a RECV work request has completed, the work request's
1681 |  * buffer is pointed at the header.
1682 |  *
1683 |  * This will advance the pointer to the data portion
1684 |  * of the control message in the work request's buffer that
1685 |  * was populated after the work request finished.
1686 |  */
1687 | static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
1688 |                                   RDMAControlHeader *head)
1689 | {
1690 |     rdma->wr_data[idx].control_len = head->len;
1691 |     rdma->wr_data[idx].control_curr =
1692 |         rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
1693 | }
1694 | 
1695 | /*
1696 |  * This is an 'atomic' high-level operation to deliver a single, unified
1697 |  * control-channel message.
1698 |  *
1699 |  * Additionally, if the user is expecting some kind of reply to this message,
1700 |  * they can request a 'resp' response message be filled in by posting an
1701 |  * additional work request on behalf of the user and waiting for an additional
1702 |  * completion.
1703 |  *
1704 |  * The extra (optional) response is used during registration to save us from
1705 |  * having to perform an *additional* exchange of messages just to provide a
1706 |  * response by instead piggy-backing on the acknowledgement.
1707 |  */
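     | /*
     |  * In short, one call performs the full sequence: wait for the
     |  * destination's READY message, optionally post a recv for the expected
     |  * response, re-post the consumed READY recv slot, SEND the message,
     |  * then (optionally) run the callback and block for the response.
     |  */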
1708 | static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
1709 |                                    uint8_t *data, RDMAControlHeader *resp,
1710 |                                    int *resp_idx,
1711 |                                    int (*callback)(RDMAContext *rdma))
1712 | {
1713 |     int ret = 0;
1714 | 
1715 |     /*
1716 |      * Wait until the dest is ready before attempting to deliver the message
1717 |      * by waiting for a READY message.
1718 |      */
1719 |     if (rdma->control_ready_expected) {
1720 |         RDMAControlHeader resp;
1721 |         ret = qemu_rdma_exchange_get_response(rdma,
1722 |                                     &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
1723 |         if (ret < 0) {
1724 |             return ret;
1725 |         }
1726 |     }
1727 | 
1728 |     /*
1729 |      * If the user is expecting a response, post a WR in anticipation of it.
1730 |      */
1731 |     if (resp) {
1732 |         ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
1733 |         if (ret) {
1734 |             fprintf(stderr, "rdma migration: error posting"
1735 |                     " extra control recv for anticipated result!");
1736 |             return ret;
1737 |         }
1738 |     }
1739 | 
1740 |     /*
1741 |      * Post a WR to replace the one we just consumed for the READY message.
1742 |      */
1743 |     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1744 |     if (ret) {
1745 |         fprintf(stderr, "rdma migration: error posting first control recv!");
1746 |         return ret;
1747 |     }
1748 | 
1749 |     /*
1750 |      * Deliver the control message that was requested.
1751 |      */
1752 |     ret = qemu_rdma_post_send_control(rdma, data, head);
1753 | 
1754 |     if (ret < 0) {
1755 |         fprintf(stderr, "Failed to send control buffer!\n");
1756 |         return ret;
1757 |     }
1758 | 
1759 |     /*
1760 |      * If we're expecting a response, block and wait for it.
1761 |      */
1762 |     if (resp) {
1763 |         if (callback) {
1764 |             DDPRINTF("Issuing callback before receiving response...\n");
1765 |             ret = callback(rdma);
1766 |             if (ret < 0) {
1767 |                 return ret;
1768 |             }
1769 |         }
1770 | 
1771 |         DDPRINTF("Waiting for response %s\n", control_desc[resp->type]);
1772 |         ret = qemu_rdma_exchange_get_response(rdma, resp,
1773 |                                               resp->type, RDMA_WRID_DATA);
1774 | 
1775 |         if (ret < 0) {
1776 |             return ret;
1777 |         }
1778 | 
1779 |         qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp);
1780 |         if (resp_idx) {
1781 |             *resp_idx = RDMA_WRID_DATA;
1782 |         }
1783 |         DDPRINTF("Response %s received.\n", control_desc[resp->type]);
1784 |     }
1785 | 
1786 |     rdma->control_ready_expected = 1;
1787 | 
1788 |     return 0;
1789 | }
1790 | 
1791 | /*
1792 |  * This is an 'atomic' high-level operation to receive a single, unified
1793 |  * control-channel message.
1794 |  */
1795 | static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head,
1796 |                                    int expecting)
1797 | {
1798 |     RDMAControlHeader ready = {
1799 |                                 .len = 0,
1800 |                                 .type = RDMA_CONTROL_READY,
1801 |                                 .repeat = 1,
1802 |                               };
1803 |     int ret;
1804 | 
1805 |     /*
1806 |      * Inform the source that we're ready to receive a message.
1807 |      */
1808 |     ret = qemu_rdma_post_send_control(rdma, NULL, &ready);
1809 | 
1810 |     if (ret < 0) {
1811 |         fprintf(stderr, "Failed to send control buffer!\n");
1812 |         return ret;
1813 |     }
1814 | 
1815 |     /*
1816 |      * Block and wait for the message.
1817 |      */
1818 |     ret = qemu_rdma_exchange_get_response(rdma, head,
1819 |                                           expecting, RDMA_WRID_READY);
1820 | 
1821 |     if (ret < 0) {
1822 |         return ret;
1823 |     }
1824 | 
1825 |     qemu_rdma_move_header(rdma, RDMA_WRID_READY, head);
1826 | 
1827 |     /*
1828 |      * Post a new RECV work request to replace the one we just consumed.
1829 |      */
1830 |     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
1831 |     if (ret) {
1832 |         fprintf(stderr, "rdma migration: error posting second control recv!");
1833 |         return ret;
1834 |     }
1835 | 
1836 |     return 0;
1837 | }
1838 | 
1839 | /*
1840 |  * Write an actual chunk of memory using RDMA.
1841 |  *
1842 |  * If we're using dynamic registration on the dest-side, we have to
1843 |  * send a registration command first.
1844 |  */
1845 | static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
1846 |                                int current_index, uint64_t current_addr,
1847 |                                uint64_t length)
1848 | {
1849 |     struct ibv_sge sge;
1850 |     struct ibv_send_wr send_wr = { 0 };
1851 |     struct ibv_send_wr *bad_wr;
1852 |     int reg_result_idx, ret, count = 0;
1853 |     uint64_t chunk, chunks;
1854 |     uint8_t *chunk_start, *chunk_end;
1855 |     RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
1856 |     RDMARegister reg;
1857 |     RDMARegisterResult *reg_result;
1858 |     RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
1859 |     RDMAControlHeader head = { .len = sizeof(RDMARegister),
1860 |                                .type = RDMA_CONTROL_REGISTER_REQUEST,
1861 |                                .repeat = 1,
1862 |                              };
1863 | 
1864 | retry:
1865 |     sge.addr = (uint64_t)(block->local_host_addr +
1866 |                           (current_addr - block->offset));
1867 |     sge.length = length;
1868 | 
1869 |     chunk = ram_chunk_index(block->local_host_addr, (uint8_t *) sge.addr);
1870 |     chunk_start = ram_chunk_start(block, chunk);
1871 | 
1872 |     if (block->is_ram_block) {
1873 |         chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
1874 | 
1875 |         if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
1876 |             chunks--;
1877 |         }
1878 |     } else {
1879 |         chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
1880 | 
1881 |         if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
1882 |             chunks--;
1883 |         }
1884 |     }
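     |     /*
     |      * 'chunks' counts the additional 1MB chunks this write spans beyond
     |      * the first one; an exact multiple of the chunk size still ends
     |      * inside the last chunk, hence the decrement above.
     |      */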
1885 | 
1886 |     DDPRINTF("Writing %" PRIu64 " chunks, (%" PRIu64 " MB)\n",
1887 |              chunks + 1, (chunks + 1) * (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
1888 | 
1889 |     chunk_end = ram_chunk_end(block, chunk + chunks);
1890 | 
1891 |     if (!rdma->pin_all) {
1892 | #ifdef RDMA_UNREGISTRATION_EXAMPLE
1893 |         qemu_rdma_unregister_waiting(rdma);
1894 | #endif
1895 |     }
1896 | 
1897 |     while (test_bit(chunk, block->transit_bitmap)) {
1898 |         (void)count;
1899 |         DDPRINTF("(%d) Not clobbering: block: %d chunk %" PRIu64
1900 |                  " current %" PRIu64 " len %" PRIu64 " %d %d\n",
1901 |                  count++, current_index, chunk,
1902 |                  sge.addr, length, rdma->nb_sent, block->nb_chunks);
1903 | 
1904 |         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
1905 | 
1906 |         if (ret < 0) {
1907 |             fprintf(stderr, "Failed to Wait for previous write to complete "
1908 |                     "block %d chunk %" PRIu64
1909 |                     " current %" PRIu64 " len %" PRIu64 " %d\n",
1910 |                     current_index, chunk, sge.addr, length, rdma->nb_sent);
1911 |             return ret;
1912 |         }
1913 |     }
1914 | 
1915 |     if (!rdma->pin_all || !block->is_ram_block) {
1916 |         if (!block->remote_keys[chunk]) {
1917 |             /*
1918 |              * This chunk has not yet been registered, so first check to see
1919 |              * if the entire chunk is zero. If so, tell the other side to
1920 |              * memset() + madvise() the entire chunk without RDMA.
1921 |              */
1922 | 
1923 |             if (can_use_buffer_find_nonzero_offset((void *)sge.addr, length)
1924 |                    && buffer_find_nonzero_offset((void *)sge.addr,
1925 |                                                  length) == length) {
1926 |                 RDMACompress comp = {
1927 |                                       .offset = current_addr,
1928 |                                       .value = 0,
1929 |                                       .block_idx = current_index,
1930 |                                       .length = length,
1931 |                                     };
1932 | 
1933 |                 head.len = sizeof(comp);
1934 |                 head.type = RDMA_CONTROL_COMPRESS;
1935 | 
1936 |                 DDPRINTF("Entire chunk is zero, sending compress: %"
1937 |                          PRIu64 " for %d "
1938 |                          "bytes, index: %d, offset: %" PRId64 "...\n",
1939 |                          chunk, sge.length, current_index, current_addr);
1940 | 
1941 |                 compress_to_network(&comp);
1942 |                 ret = qemu_rdma_exchange_send(rdma, &head,
1943 |                                 (uint8_t *) &comp, NULL, NULL, NULL);
1944 | 
1945 |                 if (ret < 0) {
1946 |                     return -EIO;
1947 |                 }
1948 | 
1949 |                 acct_update_position(f, sge.length, true);
1950 | 
1951 |                 return 1;
1952 |             }
1953 | 
1954 |             /*
1955 |              * Otherwise, tell other side to register.
1956 |              */
1957 |             reg.current_index = current_index;
1958 |             if (block->is_ram_block) {
1959 |                 reg.key.current_addr = current_addr;
1960 |             } else {
1961 |                 reg.key.chunk = chunk;
1962 |             }
1963 |             reg.chunks = chunks;
1964 | 
1965 |             DDPRINTF("Sending registration request chunk %" PRIu64 " for %d "
1966 |                      "bytes, index: %d, offset: %" PRId64 "...\n",
1967 |                      chunk, sge.length, current_index, current_addr);
1968 | 
1969 |             register_to_network(&reg);
1970 |             ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
1971 |                                     &resp, &reg_result_idx, NULL);
1972 |             if (ret < 0) {
1973 |                 return ret;
1974 |             }
1975 | 
1976 |             /* try to overlap this single registration with the one we sent. */
1977 |             if (qemu_rdma_register_and_get_keys(rdma, block,
1978 |                                                 (uint8_t *) sge.addr,
1979 |                                                 &sge.lkey, NULL, chunk,
1980 |                                                 chunk_start, chunk_end)) {
1981 |                 fprintf(stderr, "cannot get lkey!\n");
1982 |                 return -EINVAL;
1983 |             }
1984 | 
1985 |             reg_result = (RDMARegisterResult *)
1986 |                     rdma->wr_data[reg_result_idx].control_curr;
1987 | 
1988 |             network_to_result(reg_result);
1989 | 
1990 |             DDPRINTF("Received registration result:"
1991 |                      " my key: %x their key %x, chunk %" PRIu64 "\n",
1992 |                      block->remote_keys[chunk], reg_result->rkey, chunk);
1993 | 
1994 |             block->remote_keys[chunk] = reg_result->rkey;
1995 |             block->remote_host_addr = reg_result->host_addr;
1996 |         } else {
1997 |             /* already registered before */
1998 |             if (qemu_rdma_register_and_get_keys(rdma, block,
1999 |                                                 (uint8_t *)sge.addr,
2000 |                                                 &sge.lkey, NULL, chunk,
2001 |                                                 chunk_start, chunk_end)) {
2002 |                 fprintf(stderr, "cannot get lkey!\n");
2003 |                 return -EINVAL;
2004 |             }
2005 |         }
2006 | 
2007 |         send_wr.wr.rdma.rkey = block->remote_keys[chunk];
2008 |     } else {
2009 |         send_wr.wr.rdma.rkey = block->remote_rkey;
2010 | 
2011 |         if (qemu_rdma_register_and_get_keys(rdma, block, (uint8_t *)sge.addr,
2012 |                                             &sge.lkey, NULL, chunk,
2013 |                                             chunk_start, chunk_end)) {
2014 |             fprintf(stderr, "cannot get lkey!\n");
2015 |             return -EINVAL;
2016 |         }
2017 |     }
2018 | 
2019 |     /*
2020 |      * Encode the ram block index and chunk within this wrid.
2021 |      * We will use this information at the time of completion
2022 |      * to figure out which bitmap to check against and then which
2023 |      * chunk in the bitmap to look for.
2024 |      */
2025 |     send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE,
2026 |                                         current_index, chunk);
2027 | 
2028 |     send_wr.opcode = IBV_WR_RDMA_WRITE;
2029 |     send_wr.send_flags = IBV_SEND_SIGNALED;
2030 |     send_wr.sg_list = &sge;
2031 |     send_wr.num_sge = 1;
2032 |     send_wr.wr.rdma.remote_addr = block->remote_host_addr +
2033 |                                   (current_addr - block->offset);
2034 | 
2035 |     DDDPRINTF("Posting chunk: %" PRIu64 ", addr: %lx"
2036 |               " remote: %lx, bytes %" PRIu32 "\n",
2037 |               chunk, sge.addr, send_wr.wr.rdma.remote_addr,
2038 |               sge.length);
2039 | 
2040 |     /*
2041 |      * ibv_post_send() does not return negative error numbers,
2042 |      * per the specification they are positive - no idea why.
2043 |      */
2044 |     ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr);
2045 | 
2046 |     if (ret == ENOMEM) {
2047 |         DDPRINTF("send queue is full. wait a little....\n");
2048 |         ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
2049 |         if (ret < 0) {
2050 |             fprintf(stderr, "rdma migration: failed to make "
2051 |                     "room in full send queue! %d\n", ret);
2052 |             return ret;
2053 |         }
2054 | 
2055 |         goto retry;
2056 | 
2057 |     } else if (ret > 0) {
2058 |         perror("rdma migration: post rdma write failed");
2059 |         return -ret;
2060 |     }
2061 | 
2062 |     set_bit(chunk, block->transit_bitmap);
2063 |     acct_update_position(f, sge.length, false);
2064 |     rdma->total_writes++;
2065 | 
2066 |     return 0;
2067 | }
2068 | 
2069 | /*
2070 |  * Push out any unwritten RDMA operations.
2071 |  *
2072 |  * We support sending out multiple chunks at the same time.
2073 |  * Not all of them need to get signaled in the completion queue.
2074 |  */
2075 | static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma)
2076 | {
2077 |     int ret;
2078 | 
2079 |     if (!rdma->current_length) {
2080 |         return 0;
2081 |     }
2082 | 
2083 |     ret = qemu_rdma_write_one(f, rdma,
2084 |             rdma->current_index, rdma->current_addr, rdma->current_length);
2085 | 
2086 |     if (ret < 0) {
2087 |         return ret;
2088 |     }
2089 | 
2090 |     if (ret == 0) {
2091 |         rdma->nb_sent++;
2092 |         DDDPRINTF("sent total: %d\n", rdma->nb_sent);
2093 |     }
2094 | 
2095 |     rdma->current_length = 0;
2096 |     rdma->current_addr = 0;
2097 | 
2098 |     return 0;
2099 | }
2100 | 
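     | /*
     |  * Decide whether a new buffer can be appended to the current pending
     |  * RDMA write: it must continue the current write contiguously and stay
     |  * within both the same RAM block and the same registered chunk.
     |  */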
2101 | static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma,
2102 |                     uint64_t offset, uint64_t len)
2103 | {
2104 |     RDMALocalBlock *block;
2105 |     uint8_t *host_addr;
2106 |     uint8_t *chunk_end;
2107 | 
2108 |     if (rdma->current_index < 0) {
2109 |         return 0;
2110 |     }
2111 | 
2112 |     if (rdma->current_chunk < 0) {
2113 |         return 0;
2114 |     }
2115 | 
2116 |     block = &(rdma->local_ram_blocks.block[rdma->current_index]);
2117 |     host_addr = block->local_host_addr + (offset - block->offset);
2118 |     chunk_end = ram_chunk_end(block, rdma->current_chunk);
2119 | 
2120 |     if (rdma->current_length == 0) {
2121 |         return 0;
2122 |     }
2123 | 
2124 |     /*
2125 |      * Only merge into chunk sequentially.
2126 |      */
2127 |     if (offset != (rdma->current_addr + rdma->current_length)) {
2128 |         return 0;
2129 |     }
2130 | 
2131 |     if (offset < block->offset) {
2132 |         return 0;
2133 |     }
2134 | 
2135 |     if ((offset + len) > (block->offset + block->length)) {
2136 |         return 0;
2137 |     }
2138 | 
2139 |     if ((host_addr + len) > chunk_end) {
2140 |         return 0;
2141 |     }
2142 | 
2143 |     return 1;
2144 | }
2145 | 
2146 | /*
2147 |  * We're not actually writing here, but doing three things:
2148 |  *
2149 |  * 1. Identify the chunk the buffer belongs to.
2150 |  * 2. If the chunk is full or the buffer doesn't belong to the current
2151 |  *    chunk, then start a new chunk and flush() the old chunk.
2152 |  * 3. To keep the hardware busy, we also group chunks into batches
2153 |  *    and only require that a batch gets acknowledged in the completion
2154 |  *    queue instead of each individual chunk.
2155 |  */
2156 | static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
2157 |                            uint64_t block_offset, uint64_t offset,
2158 |                            uint64_t len)
2159 | {
2160 |     uint64_t current_addr = block_offset + offset;
2161 |     uint64_t index = rdma->current_index;
2162 |     uint64_t chunk = rdma->current_chunk;
2163 |     int ret;
2164 | 
2165 |     /* If we cannot merge it, we flush the current buffer first. */
2166 |     if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
2167 |         ret = qemu_rdma_write_flush(f, rdma);
2168 |         if (ret) {
2169 |             return ret;
2170 |         }
2171 |         rdma->current_length = 0;
2172 |         rdma->current_addr = current_addr;
2173 | 
2174 |         ret = qemu_rdma_search_ram_block(rdma, block_offset,
2175 |                                          offset, len, &index, &chunk);
2176 |         if (ret) {
2177 |             fprintf(stderr, "ram block search failed\n");
2178 |             return ret;
2179 |         }
2180 |         rdma->current_index = index;
2181 |         rdma->current_chunk = chunk;
2182 |     }
2183 | 
2184 |     /* merge it */
2185 |     rdma->current_length += len;
2186 | 
2187 |     /* flush it if buffer is too large */
2188 |     if (rdma->current_length >= RDMA_MERGE_MAX) {
2189 |         return qemu_rdma_write_flush(f, rdma);
2190 |     }
2191 | 
2192 |     return 0;
2193 | }
2194 | 
2195 | static void qemu_rdma_cleanup(RDMAContext *rdma)
2196 | {
2197 |     struct rdma_cm_event *cm_event;
2198 |     int ret, idx;
2199 | 
2200 |     if (rdma->cm_id && rdma->connected) {
2201 |         if (rdma->error_state) {
2202 |             RDMAControlHeader head = { .len = 0,
2203 |                                        .type = RDMA_CONTROL_ERROR,
2204 |                                        .repeat = 1,
2205 |                                      };
2206 |             fprintf(stderr, "Early error. Sending error.\n");
2207 |             qemu_rdma_post_send_control(rdma, NULL, &head);
2208 |         }
2209 | 
2210 |         ret = rdma_disconnect(rdma->cm_id);
2211 |         if (!ret) {
2212 |             DDPRINTF("waiting for disconnect\n");
2213 |             ret = rdma_get_cm_event(rdma->channel, &cm_event);
2214 |             if (!ret) {
2215 |                 rdma_ack_cm_event(cm_event);
2216 |             }
2217 |         }
2218 |         DDPRINTF("Disconnected.\n");
2219 |         rdma->connected = false;
2220 |     }
2221 | 
2222 |     g_free(rdma->block);
2223 |     rdma->block = NULL;
2224 | 
2225 |     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2226 |         if (rdma->wr_data[idx].control_mr) {
2227 |             rdma->total_registrations--;
2228 |             ibv_dereg_mr(rdma->wr_data[idx].control_mr);
2229 |         }
2230 |         rdma->wr_data[idx].control_mr = NULL;
2231 |     }
2232 | 
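     |     /*
     |      * Tear down any remaining RAM block state. Each delete call shrinks
     |      * nb_blocks and reallocates the block array; the loop assumes the
     |      * array pointer stays non-NULL while nb_blocks is still non-zero.
     |      * If a delete ever leaves the pointer NULL with blocks outstanding,
     |      * the next iteration dereferences NULL.
     |      */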
2233 |     if (rdma->local_ram_blocks.block) {
2234 |         while (rdma->local_ram_blocks.nb_blocks) {
2235 |             __qemu_rdma_delete_block(rdma,
2236 |                     rdma->local_ram_blocks.block->offset);
2237 |         }
2238 |     }
2239 | 
2240 |     if (rdma->qp) {
2241 |         rdma_destroy_qp(rdma->cm_id);
2242 |         rdma->qp = NULL;
2243 |     }
2244 |     if (rdma->cq) {
2245 |         ibv_destroy_cq(rdma->cq);
2246 |         rdma->cq = NULL;
2247 |     }
2248 |     if (rdma->comp_channel) {
2249 |         ibv_destroy_comp_channel(rdma->comp_channel);
2250 |         rdma->comp_channel = NULL;
2251 |     }
2252 |     if (rdma->pd) {
2253 |         ibv_dealloc_pd(rdma->pd);
2254 |         rdma->pd = NULL;
2255 |     }
2256 |     if (rdma->listen_id) {
2257 |         rdma_destroy_id(rdma->listen_id);
2258 |         rdma->listen_id = NULL;
2259 |     }
2260 |     if (rdma->cm_id) {
2261 |         rdma_destroy_id(rdma->cm_id);
2262 |         rdma->cm_id = NULL;
2263 |     }
2264 |     if (rdma->channel) {
2265 |         rdma_destroy_event_channel(rdma->channel);
2266 |         rdma->channel = NULL;
2267 |     }
2268 |     g_free(rdma->host);
2269 |     rdma->host = NULL;
2270 | }
2271 | 
2272 | 
2273 | static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all)
2274 | {
2275 |     int ret, idx;
2276 |     Error *local_err = NULL, **temp = &local_err;
2277 | 
2278 |     /*
2279 |      * Will be validated against destination's actual capabilities
2280 |      * after the connect() completes.
2281 |      */
2282 |     rdma->pin_all = pin_all;
2283 | 
2284 |     ret = qemu_rdma_resolve_host(rdma, temp);
2285 |     if (ret) {
2286 |         goto err_rdma_source_init;
2287 |     }
2288 | 
2289 |     ret = qemu_rdma_alloc_pd_cq(rdma);
2290 |     if (ret) {
2291 |         ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()"
2292 |                     " limits may be too low. Please check $ ulimit -a # and "
2293 |                     "search for 'ulimit -l' in the output");
2294 |         goto err_rdma_source_init;
2295 |     }
2296 | 
2297 |     ret = qemu_rdma_alloc_qp(rdma);
2298 |     if (ret) {
2299 |         ERROR(temp, "rdma migration: error allocating qp!");
2300 |         goto err_rdma_source_init;
2301 |     }
2302 | 
2303 |     ret = qemu_rdma_init_ram_blocks(rdma);
2304 |     if (ret) {
2305 |         ERROR(temp, "rdma migration: error initializing ram blocks!");
2306 |         goto err_rdma_source_init;
2307 |     }
2308 | 
2309 |     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2310 |         ret = qemu_rdma_reg_control(rdma, idx);
2311 |         if (ret) {
2312 |             ERROR(temp, "rdma migration: error registering %d control!",
2313 |                         idx);
2314 |             goto err_rdma_source_init;
2315 |         }
2316 |     }
2317 | 
2318 |     return 0;
2319 | 
2320 | err_rdma_source_init:
2321 |     error_propagate(errp, local_err);
2322 |     qemu_rdma_cleanup(rdma);
2323 |     return -1;
2324 | }
2325 | 
2326 | static int qemu_rdma_connect(RDMAContext *rdma, Error **errp)
2327 | {
2328 |     RDMACapabilities cap = {
2329 |                              .version = RDMA_CONTROL_VERSION_CURRENT,
2330 |                              .flags = 0,
2331 |                            };
2332 |     struct rdma_conn_param conn_param = { .initiator_depth = 2,
2333 |                                           .retry_count = 5,
2334 |                                           .private_data = &cap,
2335 |                                           .private_data_len = sizeof(cap),
2336 |                                         };
2337 |     struct rdma_cm_event *cm_event;
2338 |     int ret;
2339 | 
2340 |     /*
2341 |      * Only negotiate the capability with destination if the user
2342 |      * on the source first requested the capability.
2343 |      */
2344 |     if (rdma->pin_all) {
2345 |         DPRINTF("Server pin-all memory requested.\n");
2346 |         cap.flags |= RDMA_CAPABILITY_PIN_ALL;
2347 |     }
2348 | 
2349 |     caps_to_network(&cap);
2350 | 
2351 |     ret = rdma_connect(rdma->cm_id, &conn_param);
2352 |     if (ret) {
2353 |         perror("rdma_connect");
2354 |         ERROR(errp, "connecting to destination!");
2355 |         rdma_destroy_id(rdma->cm_id);
2356 |         rdma->cm_id = NULL;
2357 |         goto err_rdma_source_connect;
2358 |     }
2359 | 
2360 |     ret = rdma_get_cm_event(rdma->channel, &cm_event);
2361 |     if (ret) {
2362 |         perror("rdma_get_cm_event after rdma_connect");
2363 |         ERROR(errp, "connecting to destination!");
2364 |         rdma_ack_cm_event(cm_event);
2365 |         rdma_destroy_id(rdma->cm_id);
2366 |         rdma->cm_id = NULL;
2367 |         goto err_rdma_source_connect;
2368 |     }
2369 | 
2370 |     if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
2371 |         perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect");
2372 |         ERROR(errp, "connecting to destination!");
2373 |         rdma_ack_cm_event(cm_event);
2374 |         rdma_destroy_id(rdma->cm_id);
2375 |         rdma->cm_id = NULL;
2376 |         goto err_rdma_source_connect;
2377 |     }
2378 |     rdma->connected = true;
2379 | 
2380 |     memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));
2381 |     network_to_caps(&cap);
2382 | 
2383 |     /*
2384 |      * Verify that the *requested* capabilities are supported by the destination
2385 |      * and disable them otherwise.
2386 |      */
2387 |     if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) {
2388 |         ERROR(errp, "Server cannot support pinning all memory. "
2389 |                     "Will register memory dynamically.");
2390 |         rdma->pin_all = false;
2391 |     }
2392 | 
2393 |     DPRINTF("Pin all memory: %s\n", rdma->pin_all ? "enabled" : "disabled");
2394 | 
2395 |     rdma_ack_cm_event(cm_event);
2396 | 
2397 |     ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
2398 |     if (ret) {
2399 |         ERROR(errp, "posting second control recv!");
2400 |         goto err_rdma_source_connect;
2401 |     }
2402 | 
2403 |     rdma->control_ready_expected = 1;
2404 |     rdma->nb_sent = 0;
2405 |     return 0;
2406 | 
2407 | err_rdma_source_connect:
2408 |     qemu_rdma_cleanup(rdma);
2409 |     return -1;
2410 | }
2411 | 
2412 | static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp)
2413 | {
2414 |     int ret = -EINVAL, idx;
2415 |     struct rdma_cm_id *listen_id;
2416 |     char ip[40] = "unknown";
2417 |     struct rdma_addrinfo *res;
2418 |     char port_str[16];
2419 | 
2420 |     for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
2421 |         rdma->wr_data[idx].control_len = 0;
2422 |         rdma->wr_data[idx].control_curr = NULL;
2423 |     }
2424 | 
2425 |     if (rdma->host == NULL) {
2426 |         ERROR(errp, "RDMA host is not set!");
2427 |         rdma->error_state = -EINVAL;
2428 |         return -1;
2429 |     }
2430 |     /* create CM channel */
2431 |     rdma->channel = rdma_create_event_channel();
2432 |     if (!rdma->channel) {
2433 |         ERROR(errp, "could not create rdma event channel");
2434 |         rdma->error_state = -EINVAL;
2435 |         return -1;
2436 |     }
2437 | 
2438 |     /* create CM id */
2439 |     ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP);
2440 |     if (ret) {
2441 |         ERROR(errp, "could not create cm_id!");
2442 |         goto err_dest_init_create_listen_id;
2443 |     }
2444 | 
2445 |     snprintf(port_str, 16, "%d", rdma->port);
2446 |     port_str[15] = '\0';
2447 | 
2448 |     if (rdma->host && strcmp("", rdma->host)) {
2449 |         struct rdma_addrinfo *e;
2450 | 
2451 |         ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res);
2452 |         if (ret < 0) {
2453 |             ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host);
2454 |             goto err_dest_init_bind_addr;
2455 |         }
2456 | 
2457 |         for (e = res; e != NULL; e = e->ai_next) {
2458 |             inet_ntop(e->ai_family,
2459 |                 &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip);
2460 |             DPRINTF("Trying %s => %s\n", rdma->host, ip);
2461 |             ret = rdma_bind_addr(listen_id, e->ai_dst_addr);
2462 |             if (!ret) {
2463 |                 if (e->ai_family == AF_INET6) {
2464 |                     ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs);
2465 |                     if (ret) {
2466 |                         continue;
2467 |                     }
2468 |                 }
2469 | 
2470 |                 goto listen;
2471 |             }
2472 |         }
2473 | 
2474 |         ERROR(errp, "Error: could not rdma_bind_addr!");
2475 |         goto err_dest_init_bind_addr;
2476 |     } else {
2477 |         ERROR(errp, "migration host and port not specified!");
2478 |         ret = -EINVAL;
2479 |         goto err_dest_init_bind_addr;
2480 |     }
2481 | listen:
2482 | 
2483 |     rdma->listen_id = listen_id;
2484 |     qemu_rdma_dump_gid("dest_init", listen_id);
2485 |     return 0;
2486 | 
2487 | err_dest_init_bind_addr:
2488 |     rdma_destroy_id(listen_id);
2489 | err_dest_init_create_listen_id:
2490 |     rdma_destroy_event_channel(rdma->channel);
2491 |     rdma->channel = NULL;
2492 |     rdma->error_state = ret;
2493 |     return ret;
2494 | 
2495 | }
2496 | 
2497 | static void *qemu_rdma_data_init(const char *host_port, Error **errp)
2498 | {
2499 |     RDMAContext *rdma = NULL;
2500 |     InetSocketAddress *addr;
2501 | 
2502 |     if (host_port) {
2503 |         rdma = g_malloc0(sizeof(RDMAContext));
2504 |         memset(rdma, 0, sizeof(RDMAContext));
2505 |         rdma->current_index = -1;
2506 |         rdma->current_chunk = -1;
2507 | 
2508 |         addr = inet_parse(host_port, NULL);
2509 |         if (addr != NULL) {
2510 |             rdma->port = atoi(addr->port);
2511 |             rdma->host = g_strdup(addr->host);
2512 |         } else {
2513 |             ERROR(errp, "bad RDMA migration address '%s'", host_port);
2514 |             g_free(rdma);
2515 |             return NULL;
2516 |         }
2517 |     }
2518 | 
2519 |     return rdma;
2520 | }
2521 | 
2522 | /*
2523 |  * QEMUFile interface to the control channel.
2524 |  * SEND messages for control only.
2525 |  * pc.ram is handled with regular RDMA messages.
2526 |  */
2527 | static int qemu_rdma_put_buffer(void *opaque, const uint8_t *buf,
2528 |                                 int64_t pos, int size)
2529 | {
2530 |     QEMUFileRDMA *r = opaque;
2531 |     QEMUFile *f = r->file;
2532 |     RDMAContext *rdma = r->rdma;
2533 |     size_t remaining = size;
2534 |     uint8_t *data = (void *) buf;
2535 |     int ret;
2536 | 
2537 |     CHECK_ERROR_STATE();
2538 | 
2539 |     /*
2540 |      * Push out any writes that
2541 |      * we've queued up for pc.ram.
2542 |      */
2543 |     ret = qemu_rdma_write_flush(f, rdma);
2544 |     if (ret < 0) {
2545 |         rdma->error_state = ret;
2546 |         return ret;
2547 |     }
2548 | 
2549 |     while (remaining) {
2550 |         RDMAControlHeader head;
2551 | 
2552 |         r->len = MIN(remaining, RDMA_SEND_INCREMENT);
2553 |         remaining -= r->len;
2554 | 
2555 |         head.len = r->len;
2556 |         head.type = RDMA_CONTROL_QEMU_FILE;
2557 | 
2558 |         ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL);
2559 | 
2560 |         if (ret < 0) {
2561 |             rdma->error_state = ret;
2562 |             return ret;
2563 |         }
2564 | 
2565 |         data += r->len;
2566 |     }
2567 | 
2568 |     return size;
2569 | }
2570 | 
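     | /*
     |  * Copy up to 'size' bytes of any control data still buffered in the
     |  * given work request slot into 'buf', advancing the slot's cursor.
     |  * Returns the number of bytes actually copied (0 if the slot is empty).
     |  */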
static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf,
                             int size, int idx)
{
    size_t len = 0;

    if (rdma->wr_data[idx].control_len) {
        DDDPRINTF("RDMA %" PRId64 " of %d bytes already in buffer\n",
                  rdma->wr_data[idx].control_len, size);

        len = MIN(size, rdma->wr_data[idx].control_len);
        memcpy(buf, rdma->wr_data[idx].control_curr, len);
        rdma->wr_data[idx].control_curr += len;
        rdma->wr_data[idx].control_len -= len;
    }

    return len;
}

/*
 * QEMUFile interface to the control channel.
 * RDMA links don't use bytestreams, so we have to
 * return bytes to QEMUFile opportunistically.
 */
static int qemu_rdma_get_buffer(void *opaque, uint8_t *buf,
                                int64_t pos, int size)
{
    QEMUFileRDMA *r = opaque;
    RDMAContext *rdma = r->rdma;
    RDMAControlHeader head;
    int ret = 0;

    CHECK_ERROR_STATE();

    /*
     * First, we hold on to the last SEND message we
     * were given and dish out the bytes until we run out.
     */
    r->len = qemu_rdma_fill(r->rdma, buf, size, 0);
    if (r->len) {
        return r->len;
    }

    /*
     * Once we run out, we block and wait for another
     * SEND message to arrive.
     */
    ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE);

    if (ret < 0) {
        rdma->error_state = ret;
        return ret;
    }

    /*
     * A SEND arrived with new bytes; now try again.
     */
    return qemu_rdma_fill(r->rdma, buf, size, 0);
}

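/*
 * Sketch of the receive path (illustrative): because SEND messages
 * arrive whole, one buffered control message can satisfy many small
 * QEMUFile reads before another blocking receive is needed:
 *
 *     qemu_rdma_get_buffer(r, buf, 8, ...);  // blocks, buffers a SEND
 *     qemu_rdma_get_buffer(r, buf, 8, ...);  // served from control_curr
 *     ...                                    // until control_len == 0
 *
 * The read sizes above are hypothetical; only the buffering behavior
 * is the point.
 */
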
/*
 * Block until all the outstanding chunks have been delivered by the hardware.
 */
static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
{
    int ret;

    if (qemu_rdma_write_flush(f, rdma) < 0) {
        return -EIO;
    }

    while (rdma->nb_sent) {
        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: complete polling error!\n");
            return -EIO;
        }
    }

    qemu_rdma_unregister_waiting(rdma);

    return 0;
}

static int qemu_rdma_close(void *opaque)
{
    QEMUFileRDMA *r = opaque;

    DPRINTF("Shutting down connection.\n");
    if (r->rdma) {
        qemu_rdma_cleanup(r->rdma);
        g_free(r->rdma);
    }
    g_free(r);
    return 0;
}

/*
 * Parameters:
 *    @offset == 0 :
 *        This means that 'block_offset' is a full virtual address that does not
 *        belong to a RAMBlock of the virtual machine and instead
 *        represents a private malloc'd memory area that the caller wishes to
 *        transfer.
 *
 *    @offset != 0 :
 *        Offset is an offset to be added to block_offset and used
 *        to also lookup the corresponding RAMBlock.
 *
 *    @size > 0 :
 *        Initiate a transfer of this size.
 *
 *    @size == 0 :
 *        A 'hint' or 'advice' that means that we wish to speculatively
 *        and asynchronously unregister this memory. In this case, there is no
 *        guarantee that the unregister will actually happen, for example,
 *        if the memory is being actively transmitted. Additionally, the memory
 *        may be re-registered at any future time if a write within the same
 *        chunk was requested again, even if you attempted to unregister it
 *        here.
 *
 *    @size < 0 : TODO, not yet supported
 *        Unregister the memory NOW. This means that the caller does not
 *        expect there to be any future RDMA transfers and we just want to clean
 *        things up. This is used in case the upper layer owns the memory and
 *        cannot wait for qemu_fclose() to occur.
 *
 *    @bytes_sent : User-specified pointer to indicate how many bytes were
 *                  sent. Usually, this will not be more than a few bytes of
 *                  the protocol because most transfers are sent asynchronously.
 */
static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
                                  ram_addr_t block_offset, ram_addr_t offset,
                                  size_t size, int *bytes_sent)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    int ret;

    CHECK_ERROR_STATE();

    qemu_fflush(f);

    if (size > 0) {
        /*
         * Add this page to the current 'chunk'. If the chunk
         * is full, or the page doesn't belong to the current chunk,
         * an actual RDMA write will occur and a new chunk will be formed.
         */
        ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: write error! %d\n", ret);
            goto err;
        }

        /*
         * We always return 1 byte because the RDMA
         * protocol is completely asynchronous. We do not yet know
         * whether an identified chunk is zero or not because we're
         * waiting for other pages to potentially be merged with
         * the current chunk. So, we have to call qemu_update_position()
         * later on when the actual write occurs.
         */
        if (bytes_sent) {
            *bytes_sent = 1;
        }
    } else {
        uint64_t index, chunk;

        /* TODO: Change QEMUFileOps prototype to be signed: size_t => long
        if (size < 0) {
            ret = qemu_rdma_drain_cq(f, rdma);
            if (ret < 0) {
                fprintf(stderr, "rdma: failed to synchronously drain"
                        " completion queue before unregistration.\n");
                goto err;
            }
        }
        */

        ret = qemu_rdma_search_ram_block(rdma, block_offset,
                                         offset, size, &index, &chunk);

        if (ret) {
            fprintf(stderr, "ram block search failed\n");
            goto err;
        }

        qemu_rdma_signal_unregister(rdma, index, chunk, 0);

        /*
         * TODO: Synchronous, guaranteed unregistration (should not occur during
         * fast-path). Otherwise, unregisters will process on the next call to
         * qemu_rdma_drain_cq()
        if (size < 0) {
            qemu_rdma_unregister_waiting(rdma);
        }
        */
    }

    /*
     * Drain the Completion Queue if possible, but do not block,
     * just poll.
     *
     * If nothing to poll, the end of the iteration will do this
     * again to make sure we don't overflow the request queue.
     */
    while (1) {
        uint64_t wr_id, wr_id_in;

        /* Do not shadow the outer 'ret': the 'err' label below reports it. */
        ret = qemu_rdma_poll(rdma, &wr_id_in, NULL);
        if (ret < 0) {
            fprintf(stderr, "rdma migration: polling error! %d\n", ret);
            goto err;
        }

        wr_id = wr_id_in & RDMA_WRID_TYPE_MASK;

        if (wr_id == RDMA_WRID_NONE) {
            break;
        }
    }

    return RAM_SAVE_CONTROL_DELAYED;
err:
    rdma->error_state = ret;
    return ret;
}

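/*
 * Illustrative caller's view (not part of the original source): passing
 * size == 0 turns qemu_rdma_save_page() into an unregistration hint for
 * the chunk containing (block_offset + offset):
 *
 *     // hypothetical call site
 *     qemu_rdma_save_page(f, rfile, block_offset, offset, 0, NULL);
 *
 * No transfer is initiated; the chunk is merely queued via
 * qemu_rdma_signal_unregister(), and it may be re-registered later if
 * the same chunk is written again.
 */
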
static int qemu_rdma_accept(RDMAContext *rdma)
{
    RDMACapabilities cap;
    struct rdma_conn_param conn_param = {
        .responder_resources = 2,
        .private_data = &cap,
        .private_data_len = sizeof(cap),
    };
    struct rdma_cm_event *cm_event;
    struct ibv_context *verbs;
    int ret = -EINVAL;
    int idx;

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) {
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap));

    network_to_caps(&cap);

    if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) {
        fprintf(stderr, "Unknown source RDMA version: %d, bailing...\n",
                cap.version);
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    /*
     * Respond with only the capabilities this version of QEMU knows about.
     */
    cap.flags &= known_capabilities;

    /*
     * Enable the ones that we do know about.
     * Add other checks here as new ones are introduced.
     */
    if (cap.flags & RDMA_CAPABILITY_PIN_ALL) {
        rdma->pin_all = true;
    }

    rdma->cm_id = cm_event->id;
    verbs = cm_event->id->verbs;

    rdma_ack_cm_event(cm_event);

    DPRINTF("Memory pin all: %s\n", rdma->pin_all ? "enabled" : "disabled");

    caps_to_network(&cap);

    DPRINTF("verbs context after listen: %p\n", verbs);

    if (!rdma->verbs) {
        rdma->verbs = verbs;
    } else if (rdma->verbs != verbs) {
        fprintf(stderr, "ibv context not matching %p, %p!\n",
                rdma->verbs, verbs);
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_id("dest_init", verbs);

    ret = qemu_rdma_alloc_pd_cq(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error allocating pd and cq!\n");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_alloc_qp(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error allocating qp!\n");
        goto err_rdma_dest_wait;
    }

    ret = qemu_rdma_init_ram_blocks(rdma);
    if (ret) {
        fprintf(stderr, "rdma migration: error initializing ram blocks!\n");
        goto err_rdma_dest_wait;
    }

    for (idx = 0; idx < RDMA_WRID_MAX; idx++) {
        ret = qemu_rdma_reg_control(rdma, idx);
        if (ret) {
            fprintf(stderr, "rdma: error registering %d control!\n", idx);
            goto err_rdma_dest_wait;
        }
    }

    qemu_set_fd_handler2(rdma->channel->fd, NULL, NULL, NULL, NULL);

    ret = rdma_accept(rdma->cm_id, &conn_param);
    if (ret) {
        fprintf(stderr, "rdma_accept returns %d!\n", ret);
        goto err_rdma_dest_wait;
    }

    ret = rdma_get_cm_event(rdma->channel, &cm_event);
    if (ret) {
        fprintf(stderr, "rdma_accept get_cm_event failed %d!\n", ret);
        goto err_rdma_dest_wait;
    }

    if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) {
        fprintf(stderr, "rdma_accept: no ESTABLISHED event!\n");
        rdma_ack_cm_event(cm_event);
        goto err_rdma_dest_wait;
    }

    rdma_ack_cm_event(cm_event);
    rdma->connected = true;

    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
    if (ret) {
        fprintf(stderr, "rdma migration: error posting second control recv!\n");
        goto err_rdma_dest_wait;
    }

    qemu_rdma_dump_gid("dest_connect", rdma->cm_id);

    return 0;

err_rdma_dest_wait:
    rdma->error_state = ret;
    qemu_rdma_cleanup(rdma);
    return ret;
}

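/*
 * Worked example (illustrative): capability negotiation is a plain
 * bitwise AND against what this binary understands. If the source were
 * to advertise flags 0x03 (pin-all plus a hypothetical future bit 0x02)
 * while known_capabilities is 0x01, then:
 *
 *     cap.flags &= known_capabilities;   // 0x03 & 0x01 == 0x01
 *
 * so only RDMA_CAPABILITY_PIN_ALL survives and is echoed back to the
 * source inside the accept's private_data.
 */
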
/*
 * During each iteration of the migration, we listen for instructions
 * from the source VM to perform dynamic page registrations before they
 * can perform RDMA operations.
 *
 * We respond with the 'rkey'.
 *
 * Keep doing this until the source tells us to stop.
 */
static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque,
                                         uint64_t flags)
{
    RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult),
                                   .type = RDMA_CONTROL_REGISTER_RESULT,
                                   .repeat = 0,
                                 };
    RDMAControlHeader unreg_resp = { .len = 0,
                                     .type = RDMA_CONTROL_UNREGISTER_FINISHED,
                                     .repeat = 0,
                                   };
    RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT,
                                 .repeat = 1 };
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    RDMALocalBlocks *local = &rdma->local_ram_blocks;
    RDMAControlHeader head;
    RDMARegister *reg, *registers;
    RDMACompress *comp;
    RDMARegisterResult *reg_result;
    static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE];
    RDMALocalBlock *block;
    void *host_addr;
    int ret = 0;
    int idx = 0;
    int count = 0;
    int i = 0;

    CHECK_ERROR_STATE();

    do {
        DDDPRINTF("Waiting for next request %" PRIu64 "...\n", flags);

        ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE);

        if (ret < 0) {
            break;
        }

        if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) {
            fprintf(stderr, "rdma: Too many requests in this message (%d). "
                            "Bailing.\n", head.repeat);
            ret = -EIO;
            break;
        }

        switch (head.type) {
        case RDMA_CONTROL_COMPRESS:
            comp = (RDMACompress *) rdma->wr_data[idx].control_curr;
            network_to_compress(comp);

            DDPRINTF("Zapping zero chunk: %" PRId64
                     " bytes, index %d, offset %" PRId64 "\n",
                     comp->length, comp->block_idx, comp->offset);
            block = &(rdma->local_ram_blocks.block[comp->block_idx]);

            host_addr = block->local_host_addr +
                            (comp->offset - block->offset);

            ram_handle_compressed(host_addr, comp->value, comp->length);
            break;

        case RDMA_CONTROL_REGISTER_FINISHED:
            DDDPRINTF("Current registrations complete.\n");
            goto out;

        case RDMA_CONTROL_RAM_BLOCKS_REQUEST:
            DPRINTF("Initial setup info requested.\n");

            if (rdma->pin_all) {
                ret = qemu_rdma_reg_whole_ram_blocks(rdma);
                if (ret) {
                    fprintf(stderr, "rdma migration: error dest "
                                    "registering ram blocks!\n");
                    goto out;
                }
            }

            /*
             * Dest uses this to prepare to transmit the RAMBlock descriptions
             * to the source VM after connection setup.
             * Both sides use the "remote" structure to communicate and update
             * their "local" descriptions with what was sent.
             */
            for (i = 0; i < local->nb_blocks; i++) {
                rdma->block[i].remote_host_addr =
                    (uint64_t)(local->block[i].local_host_addr);

                if (rdma->pin_all) {
                    rdma->block[i].remote_rkey = local->block[i].mr->rkey;
                }

                rdma->block[i].offset = local->block[i].offset;
                rdma->block[i].length = local->block[i].length;

                remote_block_to_network(&rdma->block[i]);
            }

            blocks.len = rdma->local_ram_blocks.nb_blocks
                                                * sizeof(RDMARemoteBlock);

            ret = qemu_rdma_post_send_control(rdma,
                                        (uint8_t *) rdma->block, &blocks);

            if (ret < 0) {
                fprintf(stderr, "rdma migration: error sending remote info!\n");
                goto out;
            }

            break;
        case RDMA_CONTROL_REGISTER_REQUEST:
            DDPRINTF("There are %d registration requests\n", head.repeat);

            reg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                uint64_t chunk;
                uint8_t *chunk_start, *chunk_end;

                reg = &registers[count];
                network_to_register(reg);

                reg_result = &results[count];

                DDPRINTF("Registration request (%d): index %d, current_addr %"
                         PRIu64 " chunks: %" PRIu64 "\n", count,
                         reg->current_index, reg->key.current_addr, reg->chunks);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);
                if (block->is_ram_block) {
                    host_addr = (block->local_host_addr +
                                (reg->key.current_addr - block->offset));
                    chunk = ram_chunk_index(block->local_host_addr,
                                            (uint8_t *) host_addr);
                } else {
                    chunk = reg->key.chunk;
                    host_addr = block->local_host_addr +
                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
                }
                chunk_start = ram_chunk_start(block, chunk);
                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
                if (qemu_rdma_register_and_get_keys(rdma, block,
                            (uint8_t *)host_addr, NULL, &reg_result->rkey,
                            chunk, chunk_start, chunk_end)) {
                    fprintf(stderr, "cannot get rkey!\n");
                    ret = -EINVAL;
                    goto out;
                }

                reg_result->host_addr = (uint64_t) block->local_host_addr;

                DDPRINTF("Registered rkey for this request: %x\n",
                         reg_result->rkey);

                result_to_network(reg_result);
            }

            ret = qemu_rdma_post_send_control(rdma,
                            (uint8_t *) results, &reg_resp);

            if (ret < 0) {
                fprintf(stderr, "Failed to send control buffer!\n");
                goto out;
            }
            break;
        case RDMA_CONTROL_UNREGISTER_REQUEST:
            DDPRINTF("There are %d unregistration requests\n", head.repeat);
            unreg_resp.repeat = head.repeat;
            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;

            for (count = 0; count < head.repeat; count++) {
                reg = &registers[count];
                network_to_register(reg);

                DDPRINTF("Unregistration request (%d): "
                         " index %d, chunk %" PRIu64 "\n",
                         count, reg->current_index, reg->key.chunk);

                block = &(rdma->local_ram_blocks.block[reg->current_index]);

                ret = ibv_dereg_mr(block->pmr[reg->key.chunk]);
                block->pmr[reg->key.chunk] = NULL;

                if (ret != 0) {
                    perror("rdma unregistration chunk failed");
                    ret = -ret;
                    goto out;
                }

                rdma->total_registrations--;

                DDPRINTF("Unregistered chunk %" PRIu64 " successfully.\n",
                         reg->key.chunk);
            }

            ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp);

            if (ret < 0) {
                fprintf(stderr, "Failed to send control buffer!\n");
                goto out;
            }
            break;
        case RDMA_CONTROL_REGISTER_RESULT:
            fprintf(stderr, "Invalid RESULT message at dest.\n");
            ret = -EIO;
            goto out;
        default:
            fprintf(stderr, "Unknown control message %s\n",
                    control_desc[head.type]);
            ret = -EIO;
            goto out;
        }
    } while (1);
out:
    if (ret < 0) {
        rdma->error_state = ret;
    }
    return ret;
}

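/*
 * Worked example (illustrative, all addresses hypothetical): for a
 * RAMBlock whose local_host_addr is 0x7f0000000000 and whose offset is
 * 0x100000, a registration request for current_addr 0x340000 resolves
 * to:
 *
 *     host_addr = 0x7f0000000000 + (0x340000 - 0x100000)
 *               = 0x7f0000240000
 *     chunk     = (host_addr - local_host_addr) >> RDMA_REG_CHUNK_SHIFT
 *               = 0x240000 >> 20 = 2
 *
 * i.e. the third 1 MB chunk of the block gets registered and its rkey
 * is returned to the source.
 */
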
static int qemu_rdma_registration_start(QEMUFile *f, void *opaque,
                                        uint64_t flags)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;

    CHECK_ERROR_STATE();

    DDDPRINTF("start section: %" PRIu64 "\n", flags);
    qemu_put_be64(f, RAM_SAVE_FLAG_HOOK);
    qemu_fflush(f);

    return 0;
}

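/*
 * How the hook pairs up (a sketch of the control flow, not a verbatim
 * call chain): the RAM_SAVE_FLAG_HOOK written here by the source is
 * consumed on the destination, whose QEMUFile was opened with
 * .hook_ram_load = qemu_rdma_registration_handle, roughly:
 *
 *     source:  qemu_rdma_registration_start()  -> put RAM_SAVE_FLAG_HOOK
 *     dest:    ram loading sees the flag       -> hook_ram_load()
 *              qemu_rdma_registration_handle() -> serves REGISTER etc.
 */
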
/*
 * Inform dest that dynamic registrations are done for now.
 * First, flush writes, if any.
 */
static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque,
                                       uint64_t flags)
{
    Error *local_err = NULL, **errp = &local_err;
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;
    RDMAControlHeader head = { .len = 0, .repeat = 1 };
    int ret = 0;

    CHECK_ERROR_STATE();

    qemu_fflush(f);
    ret = qemu_rdma_drain_cq(f, rdma);

    if (ret < 0) {
        goto err;
    }

    if (flags == RAM_CONTROL_SETUP) {
        RDMAControlHeader resp = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT };
        RDMALocalBlocks *local = &rdma->local_ram_blocks;
        int reg_result_idx, i, j, nb_remote_blocks;

        head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST;
        DPRINTF("Sending registration setup for ram blocks...\n");

        /*
         * Make sure that we parallelize the pinning on both sides.
         * For very large guests, doing this serially takes a really
         * long time, so we have to 'interleave' the pinning locally
         * with the control messages by performing the pinning on this
         * side before we receive the control response from the other
         * side that the pinning has completed.
         */
        ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp,
                    &reg_result_idx, rdma->pin_all ?
                    qemu_rdma_reg_whole_ram_blocks : NULL);
        if (ret < 0) {
            ERROR(errp, "receiving remote info!");
            return ret;
        }

        nb_remote_blocks = resp.len / sizeof(RDMARemoteBlock);

        /*
         * The protocol uses two different sets of rkeys (mutually exclusive):
         * 1. One key to represent the virtual address of the entire ram block.
         *    (dynamic chunk registration disabled - pin everything with one rkey.)
         * 2. One to represent individual chunks within a ram block.
         *    (dynamic chunk registration enabled - pin individual chunks.)
         *
         * Once the capability is successfully negotiated, the destination transmits
         * the keys to use (or sends them later) including the virtual addresses
         * and then propagates the remote ram block descriptions to its local copy.
         */

        if (local->nb_blocks != nb_remote_blocks) {
            ERROR(errp, "ram blocks mismatch #1! "
                        "Your QEMU command line parameters are probably "
                        "not identical on both the source and destination.");
            return -EINVAL;
        }

        qemu_rdma_move_header(rdma, reg_result_idx, &resp);
        memcpy(rdma->block,
            rdma->wr_data[reg_result_idx].control_curr, resp.len);
        for (i = 0; i < nb_remote_blocks; i++) {
            network_to_remote_block(&rdma->block[i]);

            /* search local ram blocks */
            for (j = 0; j < local->nb_blocks; j++) {
                if (rdma->block[i].offset != local->block[j].offset) {
                    continue;
                }

                if (rdma->block[i].length != local->block[j].length) {
                    ERROR(errp, "ram blocks mismatch #2! "
                                "Your QEMU command line parameters are probably "
                                "not identical on both the source and destination.");
                    return -EINVAL;
                }
                local->block[j].remote_host_addr =
                        rdma->block[i].remote_host_addr;
                local->block[j].remote_rkey = rdma->block[i].remote_rkey;
                break;
            }

            if (j >= local->nb_blocks) {
                ERROR(errp, "ram blocks mismatch #3! "
                            "Your QEMU command line parameters are probably "
                            "not identical on both the source and destination.");
                return -EINVAL;
            }
        }
    }

    DDDPRINTF("Sending registration finish %" PRIu64 "...\n", flags);

    head.type = RDMA_CONTROL_REGISTER_FINISHED;
    ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL);

    if (ret < 0) {
        goto err;
    }

    return 0;
err:
    rdma->error_state = ret;
    return ret;
}

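/*
 * Worked example (illustrative): the destination's reply is a packed
 * array of RDMARemoteBlock, so the block count falls out of the length.
 * With a hypothetical resp.len of 192 and a 32-byte RDMARemoteBlock:
 *
 *     nb_remote_blocks = resp.len / sizeof(RDMARemoteBlock);  // 192/32 = 6
 *
 * Blocks are then matched to local ones purely by their 'offset' field,
 * which is why differing command lines (and thus differing RAMBlock
 * layouts) on the two sides trigger the mismatch errors above.
 */
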
static int qemu_rdma_get_fd(void *opaque)
{
    QEMUFileRDMA *rfile = opaque;
    RDMAContext *rdma = rfile->rdma;

    return rdma->comp_channel->fd;
}

const QEMUFileOps rdma_read_ops = {
    .get_buffer    = qemu_rdma_get_buffer,
    .get_fd        = qemu_rdma_get_fd,
    .close         = qemu_rdma_close,
    .hook_ram_load = qemu_rdma_registration_handle,
};

const QEMUFileOps rdma_write_ops = {
    .put_buffer         = qemu_rdma_put_buffer,
    .close              = qemu_rdma_close,
    .before_ram_iterate = qemu_rdma_registration_start,
    .after_ram_iterate  = qemu_rdma_registration_stop,
    .save_page          = qemu_rdma_save_page,
};

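/*
 * Sketch (illustrative): with these tables installed, the generic
 * QEMUFile and RAM-migration code dispatches into this file without
 * knowing anything about RDMA, along the lines of:
 *
 *     f->ops->put_buffer(...)       // control channel SENDs
 *     f->ops->save_page(...)        // pc.ram via RDMA writes
 *     f->ops->before_ram_iterate / ->after_ram_iterate
 *
 * The field names follow the QEMUFileOps initializers above; the
 * dispatch lines are schematic rather than verbatim QEMU code.
 */
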
static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode)
{
    QEMUFileRDMA *r;

    if (qemu_file_mode_is_not_valid(mode)) {
        return NULL;
    }

    r = g_malloc0(sizeof(QEMUFileRDMA));
    r->rdma = rdma;

    if (mode[0] == 'w') {
        r->file = qemu_fopen_ops(r, &rdma_write_ops);
    } else {
        r->file = qemu_fopen_ops(r, &rdma_read_ops);
    }

    return r->file;
}

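/*
 * Usage sketch (illustrative): the mode string picks the ops table, so
 * the two call sites below open the channel in opposite directions:
 *
 *     f = qemu_fopen_rdma(rdma, "rb");   // destination: rdma_read_ops
 *     f = qemu_fopen_rdma(rdma, "wb");   // source:      rdma_write_ops
 *
 * Any other mode is rejected by qemu_file_mode_is_not_valid() and NULL
 * is returned.
 */
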
static void rdma_accept_incoming_migration(void *opaque)
{
    RDMAContext *rdma = opaque;
    int ret;
    QEMUFile *f;
    Error *local_err = NULL, **errp = &local_err;

    DPRINTF("Accepting rdma connection...\n");
    ret = qemu_rdma_accept(rdma);

    if (ret) {
        ERROR(errp, "RDMA Migration initialization failed!");
        return;
    }

    DPRINTF("Accepted migration\n");

    f = qemu_fopen_rdma(rdma, "rb");
    if (f == NULL) {
        ERROR(errp, "could not qemu_fopen_rdma!");
        qemu_rdma_cleanup(rdma);
        return;
    }

    rdma->migration_started_on_destination = 1;
    process_incoming_migration(f);
}

void rdma_start_incoming_migration(const char *host_port, Error **errp)
{
    int ret;
    RDMAContext *rdma;
    Error *local_err = NULL;

    DPRINTF("Starting RDMA-based incoming migration\n");
    rdma = qemu_rdma_data_init(host_port, &local_err);

    if (rdma == NULL) {
        goto err;
    }

    ret = qemu_rdma_dest_init(rdma, &local_err);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_dest_init success\n");

    ret = rdma_listen(rdma->listen_id, 5);

    if (ret) {
        ERROR(errp, "listening on socket!");
        goto err;
    }

    DPRINTF("rdma_listen success\n");

    qemu_set_fd_handler2(rdma->channel->fd, NULL,
                         rdma_accept_incoming_migration, NULL,
                         (void *)(intptr_t) rdma);
    return;
err:
    error_propagate(errp, local_err);
    g_free(rdma);
}

void rdma_start_outgoing_migration(void *opaque,
                            const char *host_port, Error **errp)
{
    MigrationState *s = opaque;
    Error *local_err = NULL, **temp = &local_err;
    RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err);
    int ret = 0;

    if (rdma == NULL) {
        ERROR(temp, "Failed to initialize RDMA data structures!");
        goto err;
    }

    ret = qemu_rdma_source_init(rdma, &local_err,
        s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL]);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_source_init success\n");
    ret = qemu_rdma_connect(rdma, &local_err);

    if (ret) {
        goto err;
    }

    DPRINTF("qemu_rdma_source_connect success\n");

    s->file = qemu_fopen_rdma(rdma, "wb");
    migrate_fd_connect(s);
    return;
err:
    error_propagate(errp, local_err);
    g_free(rdma);
    migrate_fd_error(s);
}
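
/*
 * End-to-end usage (illustrative): these two entry points back the
 * migration URI handling, so a typical invocation looks like:
 *
 *     destination:  qemu-system-x86_64 ... -incoming x-rdma:192.168.0.1:4444
 *     source:       (qemu) migrate -d x-rdma:192.168.0.1:4444
 *
 * The 'x-rdma' protocol prefix and the address are assumptions based on
 * the experimental naming visible in MIGRATION_CAPABILITY_X_RDMA_PIN_ALL;
 * check this QEMU version's documentation for the exact spelling.
 */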