Graphite  0.5.0
GPU-accelerated graph optimization framework
Loading...
Searching...
No Matches
schur.hpp
Go to the documentation of this file.
1
2#pragma once
3
4#include "block.hpp"
5#include "graphite/block.hpp"
6#include <algorithm>
7#include <boost/container_hash/hash.hpp>
8#include <chrono>
9#include <cstdint>
10#include <cublas_v2.h>
12#include <graphite/graph.hpp>
13#include <graphite/hessian.hpp>
16#include <graphite/stream.hpp>
17#include <iostream>
18#include <limits>
19#include <ratio>
20#include <thrust/copy.h>
21#include <thrust/execution_policy.h>
22#include <thrust/functional.h>
23#include <thrust/host_vector.h>
24#include <thrust/iterator/counting_iterator.h>
25#include <thrust/reduce.h>
26#include <thrust/scan.h>
27#include <thrust/sort.h>
28#include <thrust/system/cuda/execution_policy.h>
29#include <thrust/transform.h>
30#include <thrust/transform_reduce.h>
31#include <thrust/unique.h>
32#include <unordered_map>
33#include <vector>
34
35namespace graphite {
36
37// Triplet dimensions for operation D += A*B*C, where B is always a square
38// block.
39struct ProductDim {
40 size_t dim_a;
41 size_t dim_b;
42 size_t dim_c;
43
44 bool operator==(const ProductDim &other) const {
45 return dim_a == other.dim_a && dim_b == other.dim_b && dim_c == other.dim_c;
46 }
47};
48
50
52 __host__ __device__ size_t operator()(size_t count) const {
53 return count * (count + 1) / 2;
54 }
55};
56
58 __host__ __device__ bool operator()(const BlockCoordinates &a,
59 const BlockCoordinates &b) const {
60 if (a.col == b.col) {
61 return a.row < b.row;
62 }
63 return a.col < b.col;
64 }
65};
66
67} // namespace graphite
68
69namespace std {
70template <> struct hash<graphite::ProductDim> {
71 size_t operator()(const graphite::ProductDim &pd) const {
72 size_t seed = 0;
73 boost::hash_combine(seed, pd.dim_a);
74 boost::hash_combine(seed, pd.dim_b);
75 boost::hash_combine(seed, pd.dim_c);
76 return seed;
77 }
78};
79
80} // namespace std
81
82namespace graphite {
83
87template <typename T, typename S> class SchurComplement {
88public:
89 SchurComplement(Hessian<T, S> &H) : H(H) {
90 cublasCreate(&handle);
91 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
92 lowest_eliminated_block_col = std::numeric_limits<size_t>::max();
93 }
94 ~SchurComplement() { cublasDestroy(handle); }
95
96 // Important dimensions and offsets
97 size_t lowest_eliminated_block_col;
98 size_t pose_col_start;
99 size_t landmark_col_start;
100 size_t num_block_columns;
101 size_t pose_dim;
102 size_t landmark_dim;
103 thrust::device_vector<S> values;
104 using P = std::conditional_t<is_low_precision<S>::value, T,
105 S>; // from block Jacobi preconditioner: don't
106 // want to invert at low precision
107 // let's actually disable the low precision path for now
108 static_assert(
109 !is_low_precision<S>::value,
110 "SchurComplement does not currently support low precision types.");
111 static_assert(
112 std::is_same<T, S>::value,
113 "SchurComplement currently requires T and S to be the same type.");
114 thrust::device_vector<P>
115 diagonal_values; // stores Hll block diagonals, then overwritten in-place
116 // with Hll^(-1).
117 Hessian<T, S> &H;
118 std::unordered_map<BlockCoordinates, size_t> block_indices;
119 std::unordered_map<BlockCoordinates, size_t>
120 diagonal_indices; // block diagonal indices for Hll^(-1)
121 std::unordered_map<size_t, std::vector<BlockCoordinates>>
122 diagonal_coords_by_dim;
123
124 // Buffers for multiplying the three matrices
125 std::unordered_map<ProductDim, thrust::device_vector<ops::MulOp<S>>> mul_ops;
126 std::unordered_map<ProductDim, std::vector<ops::MulOp<S>>> h_mul_ops;
127 std::unordered_map<MatVecDim, thrust::device_vector<ops::HplMatVecOp>>
128 hpl_vec_ops;
129 std::unordered_map<MatVecDim, std::vector<ops::HplMatVecOp>> h_hpl_vec_ops;
130 std::unordered_map<MatVecDim, thrust::device_vector<ops::HplMatVecOp>>
131 hplt_vec_ops;
132 std::unordered_map<MatVecDim, std::vector<ops::HplMatVecOp>> h_hplt_vec_ops;
133 std::unordered_map<MatVecDim, thrust::device_vector<ops::HplMatVecOp>>
134 schur_vec_ops;
135 std::unordered_map<MatVecDim, std::vector<ops::HplMatVecOp>> h_schur_vec_ops;
136 std::unordered_map<MatVecDim, thrust::device_vector<ops::HplMatVecOp>>
137 schur_vec_t_ops;
138 std::unordered_map<MatVecDim, std::vector<ops::HplMatVecOp>>
139 h_schur_vec_t_ops;
140 std::unordered_map<MatVecDim, thrust::device_vector<ops::BlockCopyOp>>
141 hpp_copy_ops;
142 std::unordered_map<MatVecDim, std::vector<ops::BlockCopyOp>> h_hpp_copy_ops;
143
144 // Per-block-size buffers for cublas*matinvBatched.
145 thrust::host_vector<P *> A_ptrs;
146 thrust::host_vector<P *> Ainv_ptrs;
147 std::unordered_map<size_t, thrust::device_vector<P *>> A_ptrs_device_by_dim;
148 std::unordered_map<size_t, thrust::device_vector<P *>>
149 Ainv_ptrs_device_by_dim;
150 std::unordered_map<size_t, thrust::device_vector<int>> info_by_dim;
151 std::unordered_map<size_t, thrust::device_vector<size_t>>
152 diagonal_value_offsets_by_dim;
153 std::unordered_map<size_t, thrust::device_vector<size_t>>
154 landmark_local_offsets_by_dim;
155 thrust::host_vector<size_t> h_block_col_pointers;
156 thrust::host_vector<size_t> h_block_row_indices;
157 thrust::host_vector<size_t> h_block_offsets;
158
159 // Block-CSC style metadata for Schur matrix blocks.
160 thrust::device_vector<size_t> d_col_pointers;
161 thrust::device_vector<size_t> d_row_indices;
162 thrust::device_vector<size_t> d_offsets;
163 thrust::device_vector<size_t> d_schur_offsets;
164 thrust::device_vector<size_t> scalar_to_block_map;
165
166 // Schur and temporary device vectors
167 thrust::device_vector<T> b_Schur;
168 thrust::device_vector<T> l_workspace;
169 thrust::device_vector<T> p_workspace;
170 thrust::device_vector<T> diagonal_workspace;
171
172 // For Schur structure
173 thrust::device_vector<size_t> d_pose_counts;
174 thrust::device_vector<size_t> d_pair_counts;
175 thrust::device_vector<size_t> d_pair_offsets;
176 thrust::device_vector<BlockCoordinates> d_pairs;
177 thrust::host_vector<BlockCoordinates> h_pairs;
178 thrust::device_vector<ops::SchurMulTuple> d_mul_tuples;
179 thrust::host_vector<ops::SchurMulTuple> h_mul_tuples;
180
181 // For Schur multiplication
182 std::vector<size_t> block_dims;
183
184 // For diagonal inverse multiplication
185 thrust::host_vector<size_t> h_diagonal_offsets;
186 thrust::host_vector<size_t> h_local_offsets;
187
188 // For building indices
189 thrust::host_vector<size_t> h_schur_offsets;
190 std::vector<BlockCoordinates> block_coords;
191
192 cublasHandle_t handle;
193
194 void build_structure(Graph<T, S> *graph, StreamPool &streams) {
195 h_block_col_pointers = H.get_block_col_pointers();
196 h_block_row_indices = H.get_block_row_indices();
197 h_block_offsets = H.get_block_value_offsets();
198 const auto &vertex_descriptors = graph->get_vertex_descriptors();
199
200 lowest_eliminated_block_col = graph->get_elimination_block_column();
201 pose_col_start = 0;
202 landmark_col_start = lowest_eliminated_block_col;
203 num_block_columns = graph->get_num_block_columns();
204 pose_dim = graph->get_offset_vector()[landmark_col_start];
205 landmark_dim = graph->get_hessian_dimension() - pose_dim;
206 b_Schur.resize(pose_dim);
207
208 build_schur_structure(graph, streams);
209
210 setup_Hpp_copy(graph, streams);
211
212 build_indices(graph, streams);
213
214 build_diagonal_structure(graph, streams);
215
216 setup_schur_multiplication(graph, streams);
217
218 setup_diagonal_inverse_multiply(graph, streams);
219
220 setup_Hpl_vector_multiply(graph, streams);
221
222 setup_HplT_vector_multiply(graph, streams);
223
224 setup_b_Schur_computation(graph, streams);
225 }
226
227 void update_values(Graph<T, S> *graph, StreamPool &streams) {
228 execute_Hpp_copy(graph, streams);
229
230 execute_block_diagonal_inversion(graph, streams);
231
232 execute_schur_multiplication(graph, streams);
233
234 execute_b_Schur_computation(graph, streams, b_Schur.data().get());
235 }
236
237 thrust::device_vector<T> &get_b_Schur() { return b_Schur; }
238
239 void build_indices(Graph<T, S> *graph, StreamPool &streams) {
240 (void)streams;
241 block_coords.clear();
242 block_coords.reserve(block_indices.size());
243 for (const auto &entry : block_indices) {
244 block_coords.push_back(entry.first);
245 }
246
247 csc::build_block_csc_indices(landmark_col_start, block_indices,
248 block_coords, d_col_pointers, d_row_indices,
249 d_offsets);
250
251 // Build scalar offsets for the Schur block columns (p-part only).
252 h_schur_offsets.resize(landmark_col_start + 1);
253 h_schur_offsets[0] = 0;
254 for (size_t bc = 0; bc < landmark_col_start; bc++) {
255 h_schur_offsets[bc + 1] =
256 h_schur_offsets[bc] + graph->get_variable_dimension(bc);
257 }
258 d_schur_offsets = h_schur_offsets;
259 csc::build_scalar_to_block_map(d_schur_offsets, landmark_col_start,
260 scalar_to_block_map);
261 }
262
263 template <typename I>
264 void build_csc_structure(Graph<T, S> *graph, CSCMatrix<S, I> &matrix) {
265 (void)graph;
266 csc::build_scalar_csc_structure<S, I>(pose_dim, d_col_pointers,
267 d_row_indices, d_schur_offsets,
268 scalar_to_block_map, matrix);
269 }
270
271 template <typename I>
272 void update_csc_values(Graph<T, S> *graph, CSCMatrix<S, I> &matrix) {
273 (void)graph;
274 csc::update_scalar_csc_values<S, I>(
275 pose_dim, values, d_col_pointers, d_row_indices, d_offsets,
276 d_schur_offsets, scalar_to_block_map, matrix);
277 }
278
279 void compute_landmark_update(Graph<T, S> *graph, StreamPool &streams, T *xl,
280 T *xp) {
281 auto stream = streams.select(0);
282 const auto &offsets = graph->get_offset_vector();
283 if (landmark_col_start >= num_block_columns) {
284 return;
285 }
286
287 T *b = graph->get_b().data().get();
288
289 // tmp_l = Hpl^T * xp
290 execute_HplT_vector_multiply(graph, streams, l_workspace.data().get(), xp);
291
292 // rhs_l = b_l - tmp_l
293 thrust::copy_n(thrust::cuda::par_nosync.on(stream), b + pose_dim,
294 landmark_dim, xl);
295 ops::axpy_async(stream, landmark_dim, xl, static_cast<T>(-1),
296 l_workspace.data().get(), xl);
297
298 // xl = Hll^(-1) * rhs_l
299 execute_diagonal_inverse_multiply(graph, streams, xl, xl);
300
301 cudaStreamSynchronize(stream);
302 }
303
304 // Optional setup for Schur block matvec: y = S*x.
305 // This is not required by direct solvers and should only be called when
306 // iterative methods need explicit Schur matvecs.
307 void setup_schur_vector_multiply(Graph<T, S> *graph, StreamPool &streams) {
308 (void)streams;
309 const auto &offsets = graph->get_offset_vector();
310
311 schur_vec_ops.clear();
312 h_schur_vec_ops.clear();
313 schur_vec_t_ops.clear();
314 h_schur_vec_t_ops.clear();
315
316 for (const auto &entry : block_indices) {
317 const auto &coord = entry.first;
318 const size_t row = coord.row;
319 const size_t col = coord.col;
320
321 if (row >= landmark_col_start || col >= landmark_col_start) {
322 continue;
323 }
324
325 const size_t row_dim = graph->get_variable_dimension(row);
326 const size_t col_dim = graph->get_variable_dimension(col);
327 const MatVecDim mvd{row_dim, col_dim};
328
329 h_schur_vec_ops[mvd].push_back(
330 ops::HplMatVecOp{entry.second, offsets[col], offsets[row]});
331
332 if (row != col) {
333 h_schur_vec_t_ops[mvd].push_back(
334 ops::HplMatVecOp{entry.second, offsets[row], offsets[col]});
335 }
336 }
337
338 for (const auto &entry : h_schur_vec_ops) {
339 schur_vec_ops[entry.first] = entry.second;
340 }
341 for (const auto &entry : h_schur_vec_t_ops) {
342 schur_vec_t_ops[entry.first] = entry.second;
343 }
344 }
345
346 // Executes y = S*x using Schur block values in upper-triangular storage.
347 void execute_schur_vector_multiply(Graph<T, S> *graph, StreamPool &streams,
348 T *vec_out, const T *vec_in) {
349 (void)graph;
350 auto stream = streams.select(0);
351
352 thrust::fill(thrust::cuda::par_nosync.on(stream), vec_out,
353 vec_out + pose_dim, static_cast<T>(0));
354
355 const S *s_values = values.data().get();
356 constexpr size_t threads_per_block = 256;
357
358 for (const auto &entry : schur_vec_ops) {
359 const auto &mvd = entry.first;
360 const auto &ops_dev = entry.second;
361 const size_t num_ops = ops_dev.size();
362 if (num_ops == 0) {
363 continue;
364 }
365
366 const size_t total_rows = num_ops * mvd.row;
367 const size_t blocks =
368 (total_rows + threads_per_block - 1) / threads_per_block;
369 ops::block_matvec_add_batched_kernel<T, S>
370 <<<blocks, threads_per_block, 0, stream>>>(
371 s_values, ops_dev.data().get(), num_ops, vec_in, vec_out, mvd.row,
372 mvd.col);
373 }
374
375 for (const auto &entry : schur_vec_t_ops) {
376 const auto &mvd = entry.first;
377 const auto &ops_dev = entry.second;
378 const size_t num_ops = ops_dev.size();
379 if (num_ops == 0) {
380 continue;
381 }
382
383 const size_t total_cols = num_ops * mvd.col;
384 const size_t blocks =
385 (total_cols + threads_per_block - 1) / threads_per_block;
386 ops::block_matvec_transpose_add_batched_kernel<T, S>
387 <<<blocks, threads_per_block, 0, stream>>>(
388 s_values, ops_dev.data().get(), num_ops, vec_in, vec_out, mvd.row,
389 mvd.col);
390 }
391
392 cudaStreamSynchronize(stream);
393 }
394
395private:
396 // Computes the structure of S = Hpp - Hpl*Hll^(-1)*Hpl^T
397 void build_schur_structure(Graph<T, S> *graph, StreamPool &streams) {
398 size_t num_values = 0;
399 block_indices.clear();
400 // 1. To compute the Schur matrix, we need to know which blocks are
401 // filled-in
402
403 // 1.1 First we include all the blocks in Hpp (everything less than the
404 // lowest eliminated block column)
405 for (size_t col = 0; col < landmark_col_start; col++) {
406 const size_t col_start = h_block_col_pointers[col];
407 const size_t col_end = h_block_col_pointers[col + 1];
408 for (size_t ka = col_start; ka < col_end; ka++) {
409 block_indices.emplace(BlockCoordinates{h_block_row_indices[ka], col},
410 0);
411 }
412 }
413
414 // 1.2 Next figure out which blocks are filled in due to the operation
415 // Hpl*Hll^(-1)*Hpl^T using the upper triangular Hessian.
416
417 if (landmark_col_start < num_block_columns) {
418 const size_t num_landmark_cols = num_block_columns - landmark_col_start;
419 d_pose_counts.resize(num_landmark_cols);
420 d_pair_counts.resize(num_landmark_cols);
421 d_pair_offsets.resize(num_landmark_cols + 1);
422
423 constexpr size_t threads_per_block = 256;
424 const size_t count_blocks =
425 (num_landmark_cols + threads_per_block - 1) / threads_per_block;
426
427 ops::count_pose_rows_per_landmark_column_kernel<<<count_blocks,
428 threads_per_block>>>(
429 H.get_block_col_pointers().data().get(),
430 H.get_block_row_indices().data().get(), landmark_col_start,
431 num_block_columns, d_pose_counts.data().get());
432
433 thrust::transform(thrust::device, d_pose_counts.begin(),
434 d_pose_counts.end(), d_pair_counts.begin(),
436
437 thrust::exclusive_scan(thrust::device, d_pair_counts.begin(),
438 d_pair_counts.end(), d_pair_offsets.begin());
439
440 const size_t total_pairs =
441 thrust::reduce(thrust::device, d_pair_counts.begin(),
442 d_pair_counts.end(), size_t(0));
443 d_pair_offsets[num_landmark_cols] = total_pairs;
444
445 if (total_pairs > 0) {
446 d_pairs.resize(total_pairs);
447 ops::fill_schur_structure_pairs_kernel<<<count_blocks,
448 threads_per_block>>>(
449 H.get_block_col_pointers().data().get(),
450 H.get_block_row_indices().data().get(), landmark_col_start,
451 num_block_columns, d_pose_counts.data().get(),
452 d_pair_offsets.data().get(), d_pairs.data().get());
453
454 thrust::sort(thrust::device, d_pairs.begin(), d_pairs.end(),
456
457 auto unique_end =
458 thrust::unique(thrust::device, d_pairs.begin(), d_pairs.end());
459 d_pairs.resize(unique_end - d_pairs.begin());
460
461 h_pairs = d_pairs;
462 for (const auto &coord : h_pairs) {
463 block_indices.emplace(coord, 0);
464 }
465 }
466 }
467
468 // Count scalar values and assign offsets after the sparsity pattern is set.
469 for (auto &entry : block_indices) {
470 entry.second = num_values;
471 num_values += graph->get_variable_dimension(entry.first.row) *
472 graph->get_variable_dimension(entry.first.col);
473 }
474
475 values.resize(num_values);
476 }
477
478 // Sets up computation for Schur += Hpl*Hll^(-1)*Hpl^T.
479 // Assumes this is called after preparing structure of Hll^(-1) .
480 // The inner dimension (the dimension of an l-block) serves as the loop bound.
481 // We store one MulOp buffer per unique block dimension combination. The other
482 // dimensions can be set dynamically at runtime. Only upper triangular blocks
483 // are computed.
484 void setup_schur_multiplication(Graph<T, S> *graph, StreamPool &streams) {
485 auto stream = streams.select(0);
486
487 for (auto &it : mul_ops) {
488 it.second.clear();
489 }
490
491 for (auto &it : h_mul_ops) {
492 it.second.clear();
493 }
494
495 if (landmark_col_start >= num_block_columns) {
496 return;
497 }
498
499 block_dims.resize(num_block_columns);
500 for (size_t b = 0; b < num_block_columns; b++) {
501 block_dims[b] = graph->get_variable_dimension(b);
502 }
503
504 S *s_values = values.data().get();
505 S *h_values = H.get_values_ptr();
506 S *diag_values = diagonal_values.data().get();
507
508 const size_t num_landmark_cols = num_block_columns - landmark_col_start;
509 d_pose_counts.resize(num_landmark_cols);
510 d_pair_counts.resize(num_landmark_cols);
511 d_pair_offsets.resize(num_landmark_cols + 1);
512
513 constexpr size_t threads_per_block = 256;
514 const size_t count_blocks =
515 (num_landmark_cols + threads_per_block - 1) / threads_per_block;
516
517 ops::count_pose_rows_per_landmark_column_kernel<<<
518 count_blocks, threads_per_block, 0, stream>>>(
519 H.get_block_col_pointers().data().get(),
520 H.get_block_row_indices().data().get(), landmark_col_start,
521 num_block_columns, d_pose_counts.data().get());
522
523 thrust::transform(thrust::cuda::par_nosync.on(stream),
524 d_pose_counts.begin(), d_pose_counts.end(),
525 d_pair_counts.begin(), PairCountFromPoseCount{});
526
527 thrust::exclusive_scan(thrust::cuda::par_nosync.on(stream),
528 d_pair_counts.begin(), d_pair_counts.end(),
529 d_pair_offsets.begin());
530
531 const size_t total_pairs =
532 thrust::reduce(thrust::cuda::par_nosync.on(stream),
533 d_pair_counts.begin(), d_pair_counts.end(), size_t(0));
534
535 if (total_pairs == 0) {
536 d_mul_tuples.clear();
537 h_mul_tuples.clear();
538 return;
539 }
540
541 d_mul_tuples.resize(total_pairs);
542
543 ops::fill_schur_mul_tuples_kernel<<<count_blocks, threads_per_block, 0,
544 stream>>>(
545 H.get_block_col_pointers().data().get(),
546 H.get_block_row_indices().data().get(),
547 H.get_block_value_offsets().data().get(), landmark_col_start,
548 num_block_columns, d_pose_counts.data().get(),
549 d_pair_offsets.data().get(), d_mul_tuples.data().get());
550
551 cudaStreamSynchronize(stream);
552
553 h_mul_tuples = d_mul_tuples;
554
555 // Build MulOp groups on host from GPU-generated tuples.
556 for (const auto &tuple : h_mul_tuples) {
557 const size_t l = tuple.landmark_col;
558 const BlockCoordinates ll{l, l};
559 const auto mid_it = diagonal_indices.find(ll);
560 if (mid_it == diagonal_indices.end()) {
561 continue;
562 }
563
564 const size_t dim_l = block_dims[l];
565 const size_t i = tuple.pose_row_i;
566 const size_t j = tuple.pose_row_j;
567 const BlockCoordinates dst{i, j};
568 const auto dst_it = block_indices.find(dst);
569 if (dst_it == block_indices.end()) {
570 continue;
571 }
572
573 const ProductDim pd{block_dims[i], dim_l, block_dims[j]};
574 h_mul_ops[pd].push_back(ops::MulOp<S>{
575 s_values + dst_it->second,
576 h_values + tuple.left_offset,
577 diag_values + mid_it->second,
578 h_values + tuple.right_offset,
579 });
580 }
581
582 for (auto &entry : h_mul_ops) {
583 mul_ops[entry.first] = entry.second;
584 }
585 }
586
587 void execute_Hpp_copy(Graph<T, S> *graph, StreamPool &streams) {
588 (void)graph;
589 auto stream = streams.select(0);
590 thrust::fill(thrust::cuda::par_nosync.on(stream), values.begin(),
591 values.end(), static_cast<S>(0.0));
592
593 const S *h_values = H.get_values_ptr();
594 S *s_values = values.data().get();
595
596 constexpr size_t threads_per_block = 256;
597 for (const auto &entry : hpp_copy_ops) {
598 const auto &dim = entry.first;
599 const auto &ops_dev = entry.second;
600 const size_t num_ops = ops_dev.size();
601 if (num_ops == 0) {
602 continue;
603 }
604
605 const size_t total = num_ops * dim.row * dim.col;
606 const size_t blocks = (total + threads_per_block - 1) / threads_per_block;
607 ops::block_copy_batched_kernel<S>
608 <<<blocks, threads_per_block, 0, stream>>>(h_values, s_values,
609 ops_dev.data().get(),
610 num_ops, dim.row, dim.col);
611 }
612
613 cudaStreamSynchronize(stream);
614 }
615
616 void setup_Hpp_copy(Graph<T, S> *graph, StreamPool &streams) {
617 (void)streams;
618 for (auto &it : hpp_copy_ops) {
619 it.second.clear();
620 }
621 for (auto &it : h_hpp_copy_ops) {
622 it.second.clear();
623 }
624
625 for (size_t col = 0; col < landmark_col_start; col++) {
626 const size_t col_start = h_block_col_pointers[col];
627 const size_t col_end = h_block_col_pointers[col + 1];
628 for (size_t ka = col_start; ka < col_end; ka++) {
629 const size_t row = h_block_row_indices[ka];
630 const BlockCoordinates coord{row, col};
631
632 const auto out_it = block_indices.find(coord);
633 if (out_it == block_indices.end()) {
634 continue;
635 }
636
637 const MatVecDim dim{graph->get_variable_dimension(row),
638 graph->get_variable_dimension(col)};
639 h_hpp_copy_ops[dim].push_back(
640 ops::BlockCopyOp{h_block_offsets[ka], out_it->second});
641 }
642 }
643
644 for (auto &entry : h_hpp_copy_ops) {
645 hpp_copy_ops[entry.first] = entry.second;
646 }
647 }
648
649 void execute_schur_multiplication(Graph<T, S> *graph, StreamPool &streams) {
650
651 size_t stream_idx = 0;
652 constexpr size_t threads_per_block = 256;
653 for (auto &entry : mul_ops) {
654 const auto &pd = entry.first;
655 auto &ops_dev = entry.second;
656 const size_t num_ops = ops_dev.size();
657
658 if (num_ops > 0) {
659 auto stream = streams.select(stream_idx++);
660 const size_t num_threads = num_ops * pd.dim_a * pd.dim_c;
661 const size_t num_blocks =
662 (num_threads + threads_per_block - 1) / threads_per_block;
663
664 switch (pd.dim_b) {
665 case 1:
666 ops::schur_block_product_kernel_dim_b<1, T, S>
667 <<<num_blocks, threads_per_block, 0, stream>>>(
668 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
669 break;
670 case 2:
671 ops::schur_block_product_kernel_dim_b<2, T, S>
672 <<<num_blocks, threads_per_block, 0, stream>>>(
673 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
674 break;
675 case 3:
676 ops::schur_block_product_kernel_dim_b<3, T, S>
677 <<<num_blocks, threads_per_block, 0, stream>>>(
678 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
679 break;
680 case 4:
681 ops::schur_block_product_kernel_dim_b<4, T, S>
682 <<<num_blocks, threads_per_block, 0, stream>>>(
683 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
684 break;
685 case 5:
686 ops::schur_block_product_kernel_dim_b<5, T, S>
687 <<<num_blocks, threads_per_block, 0, stream>>>(
688 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
689 break;
690 case 6:
691 ops::schur_block_product_kernel_dim_b<6, T, S>
692 <<<num_blocks, threads_per_block, 0, stream>>>(
693 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
694 break;
695 case 7:
696 ops::schur_block_product_kernel_dim_b<7, T, S>
697 <<<num_blocks, threads_per_block, 0, stream>>>(
698 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
699 break;
700 case 8:
701 ops::schur_block_product_kernel_dim_b<8, T, S>
702 <<<num_blocks, threads_per_block, 0, stream>>>(
703 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
704 break;
705 case 9:
706 ops::schur_block_product_kernel_dim_b<9, T, S>
707 <<<num_blocks, threads_per_block, 0, stream>>>(
708 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
709 break;
710 case 10:
711 ops::schur_block_product_kernel_dim_b<10, T, S>
712 <<<num_blocks, threads_per_block, 0, stream>>>(
713 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
714 break;
715 case 11:
716 ops::schur_block_product_kernel_dim_b<11, T, S>
717 <<<num_blocks, threads_per_block, 0, stream>>>(
718 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
719 break;
720 case 12:
721 ops::schur_block_product_kernel_dim_b<12, T, S>
722 <<<num_blocks, threads_per_block, 0, stream>>>(
723 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
724 break;
725 default:
726 ops::schur_block_product_kernel<T, S>
727 <<<num_blocks, threads_per_block, 0, stream>>>(
728 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_b, pd.dim_c);
729 break;
730 }
731 }
732 }
733 streams.sync_all();
734 }
735
736 /*
737 Sets up an operation for Hll^(-1) multiplying a landmark-sized vector.
738 We need to do this twice: once for computing b_Schur and another time for
739 computing the landmark update dx_l.
740 */
741 void setup_diagonal_inverse_multiply(Graph<T, S> *graph,
742 StreamPool &streams) {
743 (void)streams;
744 const auto &offsets = graph->get_offset_vector();
745 diagonal_value_offsets_by_dim.clear();
746 landmark_local_offsets_by_dim.clear();
747
748 if (landmark_col_start >= offsets.size() - 1) {
749 l_workspace.clear();
750 return;
751 }
752
753 l_workspace.resize(landmark_dim);
754 diagonal_workspace.resize(landmark_dim);
755
756 for (const auto &entry : diagonal_coords_by_dim) {
757 const size_t dim = entry.first;
758 const auto &coords = entry.second;
759
760 h_diagonal_offsets.resize(coords.size());
761 h_local_offsets.resize(coords.size());
762
763 for (size_t i = 0; i < coords.size(); i++) {
764 const auto &coord = coords[i];
765 h_diagonal_offsets[i] = diagonal_indices.at(coord);
766 h_local_offsets[i] = offsets[coord.col] - pose_dim;
767 }
768
769 diagonal_value_offsets_by_dim[dim] = h_diagonal_offsets;
770 landmark_local_offsets_by_dim[dim] = h_local_offsets;
771 }
772 }
773
774 void execute_diagonal_inverse_multiply(Graph<T, S> *graph,
775 StreamPool &streams, T *vec_out,
776 T *vec_in) {
777 auto stream = streams.select(0);
778
779 constexpr size_t threads_per_block = 256;
780 const P *diag_values_ptr = diagonal_values.data().get();
781 T *kernel_out = vec_out;
782 if (vec_out == vec_in) {
783 kernel_out = diagonal_workspace.data().get();
784 }
785
786 for (const auto &entry : diagonal_value_offsets_by_dim) {
787 const size_t dim = entry.first;
788 const auto &a_offsets_dev = entry.second;
789 const auto vec_it = landmark_local_offsets_by_dim.find(dim);
790 if (vec_it == landmark_local_offsets_by_dim.end()) {
791 continue;
792 }
793 const auto &vec_offsets_dev = vec_it->second;
794 const size_t num_blocks = a_offsets_dev.size();
795 if (num_blocks == 0) {
796 continue;
797 }
798
799 const size_t total_rows = num_blocks * dim;
800 const size_t blocks =
801 (total_rows + threads_per_block - 1) / threads_per_block;
802
803 ops::block_matvec_assign_batched_kernel<T, P, T>
804 <<<blocks, threads_per_block, 0, stream>>>(
805 diag_values_ptr, a_offsets_dev.data().get(), vec_in, kernel_out,
806 vec_offsets_dev.data().get(), num_blocks, dim);
807 }
808
809 if (vec_out == vec_in) {
810 thrust::copy_n(thrust::cuda::par_nosync.on(stream), kernel_out,
811 landmark_dim, vec_out);
812 }
813
814 cudaStreamSynchronize(stream);
815 }
816
817 void setup_Hpl_vector_multiply(Graph<T, S> *graph, StreamPool &streams) {
818 (void)streams;
819 const auto &offsets = graph->get_offset_vector();
820 p_workspace.resize(pose_dim);
821
822 for (auto &it : hpl_vec_ops) {
823 it.second.clear();
824 }
825 for (auto &it : h_hpl_vec_ops) {
826 it.second.clear();
827 }
828
829 if (landmark_col_start >= num_block_columns) {
830 return;
831 }
832
833 for (size_t l = landmark_col_start; l < num_block_columns; l++) {
834 const size_t col_start = h_block_col_pointers[l];
835 const size_t col_end = h_block_col_pointers[l + 1];
836 const size_t dim_l = graph->get_variable_dimension(l);
837 const size_t x_offset = offsets[l] - pose_dim;
838
839 for (size_t ka = col_start; ka < col_end; ka++) {
840 const size_t i = h_block_row_indices[ka];
841 if (i >= landmark_col_start) {
842 break;
843 }
844
845 const size_t dim_i = graph->get_variable_dimension(i);
846 const MatVecDim mvd{dim_i, dim_l};
847 h_hpl_vec_ops[mvd].push_back(
848 ops::HplMatVecOp{h_block_offsets[ka], x_offset, offsets[i]});
849 }
850 }
851
852 for (auto &entry : h_hpl_vec_ops) {
853 hpl_vec_ops[entry.first] = entry.second;
854 }
855 }
856
857 void execute_Hpl_vector_multiply(Graph<T, S> *graph, StreamPool &streams,
858 T *vec_out, T *vec_in) {
859 auto stream = streams.select(0);
860 const auto &offsets = graph->get_offset_vector();
861
862 thrust::fill(thrust::cuda::par.on(stream), vec_out, vec_out + pose_dim,
863 static_cast<T>(0));
864
865 const S *h_values = H.get_values_ptr();
866
867 constexpr size_t threads_per_block = 256;
868 for (const auto &entry : hpl_vec_ops) {
869 const auto &mvd = entry.first;
870 const auto &ops_dev = entry.second;
871 const size_t num_ops = ops_dev.size();
872 if (num_ops > 0) {
873 const size_t total_rows = num_ops * mvd.row;
874 const size_t blocks =
875 (total_rows + threads_per_block - 1) / threads_per_block;
876 ops::block_matvec_add_batched_kernel<T, S>
877 <<<blocks, threads_per_block, 0, stream>>>(
878 h_values, ops_dev.data().get(), num_ops, vec_in, vec_out,
879 mvd.row, mvd.col);
880 }
881 }
882
883 cudaStreamSynchronize(stream);
884 }
885
886 void setup_b_Schur_computation(Graph<T, S> *graph, StreamPool &streams) {
887 (void)streams;
888 const auto &offsets = graph->get_offset_vector();
889 if (landmark_col_start >= offsets.size()) {
890 b_Schur.clear();
891 p_workspace.clear();
892 l_workspace.clear();
893 return;
894 }
895
896 b_Schur.resize(pose_dim);
897 p_workspace.resize(pose_dim);
898 l_workspace.resize(landmark_dim);
899 }
900
901 void execute_b_Schur_computation(Graph<T, S> *graph, StreamPool &streams,
902 T *b_Schur) {
903 auto stream = streams.select(0);
904 const auto &offsets = graph->get_offset_vector();
905
906 T *b = graph->get_b().data().get();
907
908 thrust::copy_n(thrust::cuda::par.on(stream), b, pose_dim, b_Schur);
909
910 execute_diagonal_inverse_multiply(graph, streams, l_workspace.data().get(),
911 b + pose_dim);
912
913 execute_Hpl_vector_multiply(graph, streams, p_workspace.data().get(),
914 l_workspace.data().get());
915
916 ops::axpy_async(stream, pose_dim, b_Schur, static_cast<T>(-1.0),
917 p_workspace.data().get(), b_Schur);
918
919 cudaStreamSynchronize(stream);
920 }
921
922 void setup_HplT_vector_multiply(Graph<T, S> *graph, StreamPool &streams) {
923 (void)streams;
924 const auto &offsets = graph->get_offset_vector();
925 for (auto &it : hplt_vec_ops) {
926 it.second.clear();
927 }
928 for (auto &it : h_hplt_vec_ops) {
929 it.second.clear();
930 }
931
932 if (landmark_col_start >= offsets.size() - 1) {
933 l_workspace.clear();
934 return;
935 }
936 l_workspace.resize(landmark_dim);
937
938 if (landmark_col_start >= num_block_columns) {
939 return;
940 }
941
942 for (size_t l = landmark_col_start; l < num_block_columns; l++) {
943 const size_t col_start = h_block_col_pointers[l];
944 const size_t col_end = h_block_col_pointers[l + 1];
945 const size_t dim_l = graph->get_variable_dimension(l);
946 const size_t y_offset = offsets[l] - pose_dim;
947
948 for (size_t ka = col_start; ka < col_end; ka++) {
949 const size_t i = h_block_row_indices[ka];
950 if (i >= landmark_col_start) {
951 break;
952 }
953
954 const size_t dim_i = graph->get_variable_dimension(i);
955 const MatVecDim mvd{dim_i, dim_l};
956 h_hplt_vec_ops[mvd].push_back(
957 ops::HplMatVecOp{h_block_offsets[ka], offsets[i], y_offset});
958 }
959 }
960
961 for (auto &entry : h_hplt_vec_ops) {
962 hplt_vec_ops[entry.first] = entry.second;
963 }
964 }
965
966 void execute_HplT_vector_multiply(Graph<T, S> *graph, StreamPool &streams,
967 T *vec_out, T *vec_in) {
968 auto stream = streams.select(0);
969 const auto &offsets = graph->get_offset_vector();
970 if (landmark_col_start >= num_block_columns) {
971 return;
972 }
973
974 thrust::fill(thrust::cuda::par_nosync.on(stream), vec_out,
975 vec_out + landmark_dim, static_cast<T>(0));
976
977 const S *h_values = H.get_values_ptr();
978
979 constexpr size_t threads_per_block = 256;
980 for (const auto &entry : hplt_vec_ops) {
981 const auto &mvd = entry.first;
982 const auto &ops_dev = entry.second;
983 const size_t num_ops = ops_dev.size();
984 if (num_ops == 0) {
985 continue;
986 }
987
988 const size_t total_cols = num_ops * mvd.col;
989 const size_t blocks =
990 (total_cols + threads_per_block - 1) / threads_per_block;
991 ops::block_matvec_transpose_add_batched_kernel<T, S>
992 <<<blocks, threads_per_block, 0, stream>>>(
993 h_values, ops_dev.data().get(), num_ops, vec_in, vec_out, mvd.row,
994 mvd.col);
995 }
996
997 cudaStreamSynchronize(stream);
998 }
999
1000 // Builds the structure of Hll^(-1)
1001 void build_diagonal_structure(Graph<T, S> *graph, StreamPool &streams) {
1002 diagonal_indices.clear();
1003 diagonal_coords_by_dim.clear();
1004
1005 // Keep only l-diagonal blocks from Hessian structure.
1006 size_t diagonal_num_values = 0;
1007 for (size_t block_col = landmark_col_start; block_col < num_block_columns;
1008 block_col++) {
1009 const BlockCoordinates coord{block_col, block_col};
1010 diagonal_indices.emplace(coord, diagonal_num_values);
1011 const size_t dim = graph->get_variable_dimension(block_col);
1012 diagonal_coords_by_dim[dim].push_back(coord);
1013 diagonal_num_values += dim * dim;
1014 }
1015
1016 diagonal_values.resize(diagonal_num_values);
1017 setup_batched_inverse_buffers(streams);
1018 }
1019
1020 void setup_batched_inverse_buffers(StreamPool &streams) {
1021 auto stream = streams.select(0);
1022 P *diag_values = diagonal_values.data().get();
1023 const P *h_values = H.get_values_ptr();
1024
1025 A_ptrs.clear();
1026 Ainv_ptrs.clear();
1027 A_ptrs_device_by_dim.clear();
1028 Ainv_ptrs_device_by_dim.clear();
1029 info_by_dim.clear();
1030
1031 for (auto &entry : diagonal_coords_by_dim) {
1032 const size_t dim = entry.first;
1033 auto &coords = entry.second;
1034 const size_t num_blocks = coords.size();
1035
1036 auto &A_ptrs_device = A_ptrs_device_by_dim[dim];
1037 auto &Ainv_ptrs_device = Ainv_ptrs_device_by_dim[dim];
1038 auto &info = info_by_dim[dim];
1039
1040 A_ptrs.resize(num_blocks);
1041 Ainv_ptrs.resize(num_blocks);
1042 A_ptrs_device.resize(num_blocks);
1043 Ainv_ptrs_device.resize(num_blocks);
1044 info.resize(num_blocks);
1045
1046 for (size_t i = 0; i < num_blocks; i++) {
1047 const auto &coord = coords[i];
1048 const size_t src_offset = H.block_indices.at(coord);
1049 const size_t dst_offset = diagonal_indices.at(coord);
1050 this->A_ptrs[i] = const_cast<P *>(h_values + src_offset);
1051 this->Ainv_ptrs[i] = diag_values + dst_offset;
1052 }
1053
1054 cudaMemcpyAsync(A_ptrs_device.data().get(), this->A_ptrs.data(),
1055 sizeof(P *) * num_blocks, cudaMemcpyHostToDevice, stream);
1056 cudaMemcpyAsync(Ainv_ptrs_device.data().get(), this->Ainv_ptrs.data(),
1057 sizeof(P *) * num_blocks, cudaMemcpyHostToDevice, stream);
1058
1059 cudaStreamSynchronize(stream);
1060
1061 // Clear for reuse
1062 A_ptrs.clear();
1063 Ainv_ptrs.clear();
1064 }
1065 }
1066
1067 void execute_block_diagonal_inversion(Graph<T, S> *graph,
1068 StreamPool &streams) {
1069 (void)graph;
1070 auto stream = streams.select(0);
1071 cublasSetStream(handle, stream);
1072
1073 // cublas<t>matinvBatched only supports small matrices.
1074 constexpr size_t max_supported_dim = 32;
1075
1076 for (auto &entry : diagonal_coords_by_dim) {
1077 const size_t block_dim = entry.first;
1078 if (block_dim > max_supported_dim) {
1079 std::cerr << "Runtime error: Schur diagonal inversion received block "
1080 "dimension "
1081 << block_dim
1082 << ", but cublas matinvBatched supports at "
1083 "most "
1084 << max_supported_dim << "." << std::endl;
1085 break;
1086 }
1087
1088 auto &A_ptrs_device = A_ptrs_device_by_dim[block_dim];
1089 auto &Ainv_ptrs_device = Ainv_ptrs_device_by_dim[block_dim];
1090 auto &info = info_by_dim[block_dim];
1091 const int num_blocks = static_cast<int>(entry.second.size());
1092
1093 // if (ops::launch_small_block_inverse_batched<P>(
1094 // block_dim, A_ptrs_device.data().get(),
1095 // Ainv_ptrs_device.data().get(), static_cast<size_t>(num_blocks),
1096 // stream)) {
1097 // continue;
1098 // }
1099
1100 if constexpr (std::is_same<P, double>::value) {
1101 cublasDmatinvBatched(
1102 handle, static_cast<int>(block_dim), A_ptrs_device.data().get(),
1103 static_cast<int>(block_dim), Ainv_ptrs_device.data().get(),
1104 static_cast<int>(block_dim), info.data().get(), num_blocks);
1105 } else if constexpr (std::is_same<P, float>::value) {
1106 cublasSmatinvBatched(
1107 handle, static_cast<int>(block_dim), A_ptrs_device.data().get(),
1108 static_cast<int>(block_dim), Ainv_ptrs_device.data().get(),
1109 static_cast<int>(block_dim), info.data().get(), num_blocks);
1110 }
1111 }
1112
1113 cudaStreamSynchronize(stream);
1114 }
1115};
1116
1117} // namespace graphite
Definition block.hpp:6
Class for computing the explicit Schur complement.
Definition schur.hpp:87
Definition stream.hpp:7
The top-level namespace for Graphite.
Definition eigen_solver.cpp:4
Definition schur.hpp:57
Definition schur.hpp:51
Definition schur.hpp:39
Definition schur.hpp:27
Definition schur.hpp:21
Stores offsets for Hpl*Hll^(-1)*Hpl^T operation.
Definition schur.hpp:14