90 cublasCreate(&handle);
91 cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_DEVICE);
92 lowest_eliminated_block_col = std::numeric_limits<size_t>::max();
97 size_t lowest_eliminated_block_col;
98 size_t pose_col_start;
99 size_t landmark_col_start;
100 size_t num_block_columns;
103 thrust::device_vector<S> values;
104 using P = std::conditional_t<is_low_precision<S>::value, T,
109 !is_low_precision<S>::value,
110 "SchurComplement does not currently support low precision types.");
112 std::is_same<T, S>::value,
113 "SchurComplement currently requires T and S to be the same type.");
114 thrust::device_vector<P>
118 std::unordered_map<BlockCoordinates, size_t> block_indices;
119 std::unordered_map<BlockCoordinates, size_t>
121 std::unordered_map<size_t, std::vector<BlockCoordinates>>
122 diagonal_coords_by_dim;
125 std::unordered_map<ProductDim, thrust::device_vector<ops::MulOp<S>>> mul_ops;
126 std::unordered_map<ProductDim, std::vector<ops::MulOp<S>>> h_mul_ops;
127 std::unordered_map<MatVecDim, thrust::device_vector<ops::HplMatVecOp>>
129 std::unordered_map<MatVecDim, std::vector<ops::HplMatVecOp>> h_hpl_vec_ops;
130 std::unordered_map<MatVecDim, thrust::device_vector<ops::HplMatVecOp>>
132 std::unordered_map<MatVecDim, std::vector<ops::HplMatVecOp>> h_hplt_vec_ops;
133 std::unordered_map<MatVecDim, thrust::device_vector<ops::HplMatVecOp>>
135 std::unordered_map<MatVecDim, std::vector<ops::HplMatVecOp>> h_schur_vec_ops;
136 std::unordered_map<MatVecDim, thrust::device_vector<ops::HplMatVecOp>>
138 std::unordered_map<MatVecDim, std::vector<ops::HplMatVecOp>>
140 std::unordered_map<MatVecDim, thrust::device_vector<ops::BlockCopyOp>>
142 std::unordered_map<MatVecDim, std::vector<ops::BlockCopyOp>> h_hpp_copy_ops;
145 thrust::host_vector<P *> A_ptrs;
146 thrust::host_vector<P *> Ainv_ptrs;
147 std::unordered_map<size_t, thrust::device_vector<P *>> A_ptrs_device_by_dim;
148 std::unordered_map<size_t, thrust::device_vector<P *>>
149 Ainv_ptrs_device_by_dim;
150 std::unordered_map<size_t, thrust::device_vector<int>> info_by_dim;
151 std::unordered_map<size_t, thrust::device_vector<size_t>>
152 diagonal_value_offsets_by_dim;
153 std::unordered_map<size_t, thrust::device_vector<size_t>>
154 landmark_local_offsets_by_dim;
155 thrust::host_vector<size_t> h_block_col_pointers;
156 thrust::host_vector<size_t> h_block_row_indices;
157 thrust::host_vector<size_t> h_block_offsets;
160 thrust::device_vector<size_t> d_col_pointers;
161 thrust::device_vector<size_t> d_row_indices;
162 thrust::device_vector<size_t> d_offsets;
163 thrust::device_vector<size_t> d_schur_offsets;
164 thrust::device_vector<size_t> scalar_to_block_map;
167 thrust::device_vector<T> b_Schur;
168 thrust::device_vector<T> l_workspace;
169 thrust::device_vector<T> p_workspace;
170 thrust::device_vector<T> diagonal_workspace;
173 thrust::device_vector<size_t> d_pose_counts;
174 thrust::device_vector<size_t> d_pair_counts;
175 thrust::device_vector<size_t> d_pair_offsets;
176 thrust::device_vector<BlockCoordinates> d_pairs;
177 thrust::host_vector<BlockCoordinates> h_pairs;
178 thrust::device_vector<ops::SchurMulTuple> d_mul_tuples;
179 thrust::host_vector<ops::SchurMulTuple> h_mul_tuples;
182 std::vector<size_t> block_dims;
185 thrust::host_vector<size_t> h_diagonal_offsets;
186 thrust::host_vector<size_t> h_local_offsets;
189 thrust::host_vector<size_t> h_schur_offsets;
190 std::vector<BlockCoordinates> block_coords;
192 cublasHandle_t handle;
194 void build_structure(Graph<T, S> *graph,
StreamPool &streams) {
195 h_block_col_pointers = H.get_block_col_pointers();
196 h_block_row_indices = H.get_block_row_indices();
197 h_block_offsets = H.get_block_value_offsets();
198 const auto &vertex_descriptors = graph->get_vertex_descriptors();
200 lowest_eliminated_block_col = graph->get_elimination_block_column();
202 landmark_col_start = lowest_eliminated_block_col;
203 num_block_columns = graph->get_num_block_columns();
204 pose_dim = graph->get_offset_vector()[landmark_col_start];
205 landmark_dim = graph->get_hessian_dimension() - pose_dim;
206 b_Schur.resize(pose_dim);
208 build_schur_structure(graph, streams);
210 setup_Hpp_copy(graph, streams);
212 build_indices(graph, streams);
214 build_diagonal_structure(graph, streams);
216 setup_schur_multiplication(graph, streams);
218 setup_diagonal_inverse_multiply(graph, streams);
220 setup_Hpl_vector_multiply(graph, streams);
222 setup_HplT_vector_multiply(graph, streams);
224 setup_b_Schur_computation(graph, streams);
227 void update_values(Graph<T, S> *graph,
StreamPool &streams) {
228 execute_Hpp_copy(graph, streams);
230 execute_block_diagonal_inversion(graph, streams);
232 execute_schur_multiplication(graph, streams);
234 execute_b_Schur_computation(graph, streams, b_Schur.data().get());
237 thrust::device_vector<T> &get_b_Schur() {
return b_Schur; }
239 void build_indices(Graph<T, S> *graph,
StreamPool &streams) {
241 block_coords.clear();
242 block_coords.reserve(block_indices.size());
243 for (
const auto &entry : block_indices) {
244 block_coords.push_back(entry.first);
247 csc::build_block_csc_indices(landmark_col_start, block_indices,
248 block_coords, d_col_pointers, d_row_indices,
252 h_schur_offsets.resize(landmark_col_start + 1);
253 h_schur_offsets[0] = 0;
254 for (
size_t bc = 0; bc < landmark_col_start; bc++) {
255 h_schur_offsets[bc + 1] =
256 h_schur_offsets[bc] + graph->get_variable_dimension(bc);
258 d_schur_offsets = h_schur_offsets;
259 csc::build_scalar_to_block_map(d_schur_offsets, landmark_col_start,
260 scalar_to_block_map);
263 template <
typename I>
264 void build_csc_structure(Graph<T, S> *graph, CSCMatrix<S, I> &matrix) {
266 csc::build_scalar_csc_structure<S, I>(pose_dim, d_col_pointers,
267 d_row_indices, d_schur_offsets,
268 scalar_to_block_map, matrix);
271 template <
typename I>
272 void update_csc_values(Graph<T, S> *graph, CSCMatrix<S, I> &matrix) {
274 csc::update_scalar_csc_values<S, I>(
275 pose_dim, values, d_col_pointers, d_row_indices, d_offsets,
276 d_schur_offsets, scalar_to_block_map, matrix);
279 void compute_landmark_update(Graph<T, S> *graph,
StreamPool &streams, T *xl,
281 auto stream = streams.select(0);
282 const auto &offsets = graph->get_offset_vector();
283 if (landmark_col_start >= num_block_columns) {
287 T *b = graph->get_b().data().get();
290 execute_HplT_vector_multiply(graph, streams, l_workspace.data().get(), xp);
293 thrust::copy_n(thrust::cuda::par_nosync.on(stream), b + pose_dim,
295 ops::axpy_async(stream, landmark_dim, xl,
static_cast<T
>(-1),
296 l_workspace.data().get(), xl);
299 execute_diagonal_inverse_multiply(graph, streams, xl, xl);
301 cudaStreamSynchronize(stream);
307 void setup_schur_vector_multiply(Graph<T, S> *graph,
StreamPool &streams) {
309 const auto &offsets = graph->get_offset_vector();
311 schur_vec_ops.clear();
312 h_schur_vec_ops.clear();
313 schur_vec_t_ops.clear();
314 h_schur_vec_t_ops.clear();
316 for (
const auto &entry : block_indices) {
317 const auto &coord = entry.first;
318 const size_t row = coord.row;
319 const size_t col = coord.col;
321 if (row >= landmark_col_start || col >= landmark_col_start) {
325 const size_t row_dim = graph->get_variable_dimension(row);
326 const size_t col_dim = graph->get_variable_dimension(col);
329 h_schur_vec_ops[mvd].push_back(
333 h_schur_vec_t_ops[mvd].push_back(
338 for (
const auto &entry : h_schur_vec_ops) {
339 schur_vec_ops[entry.first] = entry.second;
341 for (
const auto &entry : h_schur_vec_t_ops) {
342 schur_vec_t_ops[entry.first] = entry.second;
347 void execute_schur_vector_multiply(Graph<T, S> *graph,
StreamPool &streams,
348 T *vec_out,
const T *vec_in) {
350 auto stream = streams.select(0);
352 thrust::fill(thrust::cuda::par_nosync.on(stream), vec_out,
353 vec_out + pose_dim,
static_cast<T
>(0));
355 const S *s_values = values.data().get();
356 constexpr size_t threads_per_block = 256;
358 for (
const auto &entry : schur_vec_ops) {
359 const auto &mvd = entry.first;
360 const auto &ops_dev = entry.second;
361 const size_t num_ops = ops_dev.size();
366 const size_t total_rows = num_ops * mvd.row;
367 const size_t blocks =
368 (total_rows + threads_per_block - 1) / threads_per_block;
369 ops::block_matvec_add_batched_kernel<T, S>
370 <<<blocks, threads_per_block, 0, stream>>>(
371 s_values, ops_dev.data().get(), num_ops, vec_in, vec_out, mvd.row,
375 for (
const auto &entry : schur_vec_t_ops) {
376 const auto &mvd = entry.first;
377 const auto &ops_dev = entry.second;
378 const size_t num_ops = ops_dev.size();
383 const size_t total_cols = num_ops * mvd.col;
384 const size_t blocks =
385 (total_cols + threads_per_block - 1) / threads_per_block;
386 ops::block_matvec_transpose_add_batched_kernel<T, S>
387 <<<blocks, threads_per_block, 0, stream>>>(
388 s_values, ops_dev.data().get(), num_ops, vec_in, vec_out, mvd.row,
392 cudaStreamSynchronize(stream);
397 void build_schur_structure(Graph<T, S> *graph,
StreamPool &streams) {
398 size_t num_values = 0;
399 block_indices.clear();
405 for (
size_t col = 0; col < landmark_col_start; col++) {
406 const size_t col_start = h_block_col_pointers[col];
407 const size_t col_end = h_block_col_pointers[col + 1];
408 for (
size_t ka = col_start; ka < col_end; ka++) {
417 if (landmark_col_start < num_block_columns) {
418 const size_t num_landmark_cols = num_block_columns - landmark_col_start;
419 d_pose_counts.resize(num_landmark_cols);
420 d_pair_counts.resize(num_landmark_cols);
421 d_pair_offsets.resize(num_landmark_cols + 1);
423 constexpr size_t threads_per_block = 256;
424 const size_t count_blocks =
425 (num_landmark_cols + threads_per_block - 1) / threads_per_block;
427 ops::count_pose_rows_per_landmark_column_kernel<<<count_blocks,
428 threads_per_block>>>(
429 H.get_block_col_pointers().data().get(),
430 H.get_block_row_indices().data().get(), landmark_col_start,
431 num_block_columns, d_pose_counts.data().get());
433 thrust::transform(thrust::device, d_pose_counts.begin(),
434 d_pose_counts.end(), d_pair_counts.begin(),
437 thrust::exclusive_scan(thrust::device, d_pair_counts.begin(),
438 d_pair_counts.end(), d_pair_offsets.begin());
440 const size_t total_pairs =
441 thrust::reduce(thrust::device, d_pair_counts.begin(),
442 d_pair_counts.end(),
size_t(0));
443 d_pair_offsets[num_landmark_cols] = total_pairs;
445 if (total_pairs > 0) {
446 d_pairs.resize(total_pairs);
447 ops::fill_schur_structure_pairs_kernel<<<count_blocks,
448 threads_per_block>>>(
449 H.get_block_col_pointers().data().get(),
450 H.get_block_row_indices().data().get(), landmark_col_start,
451 num_block_columns, d_pose_counts.data().get(),
452 d_pair_offsets.data().get(), d_pairs.data().get());
454 thrust::sort(thrust::device, d_pairs.begin(), d_pairs.end(),
458 thrust::unique(thrust::device, d_pairs.begin(), d_pairs.end());
459 d_pairs.resize(unique_end - d_pairs.begin());
462 for (
const auto &coord : h_pairs) {
463 block_indices.emplace(coord, 0);
469 for (
auto &entry : block_indices) {
470 entry.second = num_values;
471 num_values += graph->get_variable_dimension(entry.first.row) *
472 graph->get_variable_dimension(entry.first.col);
475 values.resize(num_values);
484 void setup_schur_multiplication(Graph<T, S> *graph,
StreamPool &streams) {
485 auto stream = streams.select(0);
487 for (
auto &it : mul_ops) {
491 for (
auto &it : h_mul_ops) {
495 if (landmark_col_start >= num_block_columns) {
499 block_dims.resize(num_block_columns);
500 for (
size_t b = 0; b < num_block_columns; b++) {
501 block_dims[b] = graph->get_variable_dimension(b);
504 S *s_values = values.data().get();
505 S *h_values = H.get_values_ptr();
506 S *diag_values = diagonal_values.data().get();
508 const size_t num_landmark_cols = num_block_columns - landmark_col_start;
509 d_pose_counts.resize(num_landmark_cols);
510 d_pair_counts.resize(num_landmark_cols);
511 d_pair_offsets.resize(num_landmark_cols + 1);
513 constexpr size_t threads_per_block = 256;
514 const size_t count_blocks =
515 (num_landmark_cols + threads_per_block - 1) / threads_per_block;
517 ops::count_pose_rows_per_landmark_column_kernel<<<
518 count_blocks, threads_per_block, 0, stream>>>(
519 H.get_block_col_pointers().data().get(),
520 H.get_block_row_indices().data().get(), landmark_col_start,
521 num_block_columns, d_pose_counts.data().get());
523 thrust::transform(thrust::cuda::par_nosync.on(stream),
524 d_pose_counts.begin(), d_pose_counts.end(),
527 thrust::exclusive_scan(thrust::cuda::par_nosync.on(stream),
528 d_pair_counts.begin(), d_pair_counts.end(),
529 d_pair_offsets.begin());
531 const size_t total_pairs =
532 thrust::reduce(thrust::cuda::par_nosync.on(stream),
533 d_pair_counts.begin(), d_pair_counts.end(),
size_t(0));
535 if (total_pairs == 0) {
536 d_mul_tuples.clear();
537 h_mul_tuples.clear();
541 d_mul_tuples.resize(total_pairs);
543 ops::fill_schur_mul_tuples_kernel<<<count_blocks, threads_per_block, 0,
545 H.get_block_col_pointers().data().get(),
546 H.get_block_row_indices().data().get(),
547 H.get_block_value_offsets().data().get(), landmark_col_start,
548 num_block_columns, d_pose_counts.data().get(),
549 d_pair_offsets.data().get(), d_mul_tuples.data().get());
551 cudaStreamSynchronize(stream);
553 h_mul_tuples = d_mul_tuples;
556 for (
const auto &tuple : h_mul_tuples) {
557 const size_t l = tuple.landmark_col;
559 const auto mid_it = diagonal_indices.find(ll);
560 if (mid_it == diagonal_indices.end()) {
564 const size_t dim_l = block_dims[l];
565 const size_t i = tuple.pose_row_i;
566 const size_t j = tuple.pose_row_j;
568 const auto dst_it = block_indices.find(dst);
569 if (dst_it == block_indices.end()) {
573 const ProductDim pd{block_dims[i], dim_l, block_dims[j]};
575 s_values + dst_it->second,
576 h_values + tuple.left_offset,
577 diag_values + mid_it->second,
578 h_values + tuple.right_offset,
582 for (
auto &entry : h_mul_ops) {
583 mul_ops[entry.first] = entry.second;
587 void execute_Hpp_copy(Graph<T, S> *graph,
StreamPool &streams) {
589 auto stream = streams.select(0);
590 thrust::fill(thrust::cuda::par_nosync.on(stream), values.begin(),
591 values.end(),
static_cast<S
>(0.0));
593 const S *h_values = H.get_values_ptr();
594 S *s_values = values.data().get();
596 constexpr size_t threads_per_block = 256;
597 for (
const auto &entry : hpp_copy_ops) {
598 const auto &dim = entry.first;
599 const auto &ops_dev = entry.second;
600 const size_t num_ops = ops_dev.size();
605 const size_t total = num_ops * dim.row * dim.col;
606 const size_t blocks = (total + threads_per_block - 1) / threads_per_block;
607 ops::block_copy_batched_kernel<S>
608 <<<blocks, threads_per_block, 0, stream>>>(h_values, s_values,
609 ops_dev.data().get(),
610 num_ops, dim.row, dim.col);
613 cudaStreamSynchronize(stream);
616 void setup_Hpp_copy(Graph<T, S> *graph,
StreamPool &streams) {
618 for (
auto &it : hpp_copy_ops) {
621 for (
auto &it : h_hpp_copy_ops) {
625 for (
size_t col = 0; col < landmark_col_start; col++) {
626 const size_t col_start = h_block_col_pointers[col];
627 const size_t col_end = h_block_col_pointers[col + 1];
628 for (
size_t ka = col_start; ka < col_end; ka++) {
629 const size_t row = h_block_row_indices[ka];
632 const auto out_it = block_indices.find(coord);
633 if (out_it == block_indices.end()) {
637 const MatVecDim dim{graph->get_variable_dimension(row),
638 graph->get_variable_dimension(col)};
639 h_hpp_copy_ops[dim].push_back(
644 for (
auto &entry : h_hpp_copy_ops) {
645 hpp_copy_ops[entry.first] = entry.second;
649 void execute_schur_multiplication(Graph<T, S> *graph,
StreamPool &streams) {
651 size_t stream_idx = 0;
652 constexpr size_t threads_per_block = 256;
653 for (
auto &entry : mul_ops) {
654 const auto &pd = entry.first;
655 auto &ops_dev = entry.second;
656 const size_t num_ops = ops_dev.size();
659 auto stream = streams.select(stream_idx++);
660 const size_t num_threads = num_ops * pd.dim_a * pd.dim_c;
661 const size_t num_blocks =
662 (num_threads + threads_per_block - 1) / threads_per_block;
666 ops::schur_block_product_kernel_dim_b<1, T, S>
667 <<<num_blocks, threads_per_block, 0, stream>>>(
668 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
671 ops::schur_block_product_kernel_dim_b<2, T, S>
672 <<<num_blocks, threads_per_block, 0, stream>>>(
673 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
676 ops::schur_block_product_kernel_dim_b<3, T, S>
677 <<<num_blocks, threads_per_block, 0, stream>>>(
678 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
681 ops::schur_block_product_kernel_dim_b<4, T, S>
682 <<<num_blocks, threads_per_block, 0, stream>>>(
683 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
686 ops::schur_block_product_kernel_dim_b<5, T, S>
687 <<<num_blocks, threads_per_block, 0, stream>>>(
688 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
691 ops::schur_block_product_kernel_dim_b<6, T, S>
692 <<<num_blocks, threads_per_block, 0, stream>>>(
693 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
696 ops::schur_block_product_kernel_dim_b<7, T, S>
697 <<<num_blocks, threads_per_block, 0, stream>>>(
698 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
701 ops::schur_block_product_kernel_dim_b<8, T, S>
702 <<<num_blocks, threads_per_block, 0, stream>>>(
703 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
706 ops::schur_block_product_kernel_dim_b<9, T, S>
707 <<<num_blocks, threads_per_block, 0, stream>>>(
708 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
711 ops::schur_block_product_kernel_dim_b<10, T, S>
712 <<<num_blocks, threads_per_block, 0, stream>>>(
713 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
716 ops::schur_block_product_kernel_dim_b<11, T, S>
717 <<<num_blocks, threads_per_block, 0, stream>>>(
718 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
721 ops::schur_block_product_kernel_dim_b<12, T, S>
722 <<<num_blocks, threads_per_block, 0, stream>>>(
723 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_c);
726 ops::schur_block_product_kernel<T, S>
727 <<<num_blocks, threads_per_block, 0, stream>>>(
728 ops_dev.data().get(), num_ops, pd.dim_a, pd.dim_b, pd.dim_c);
741 void setup_diagonal_inverse_multiply(Graph<T, S> *graph,
744 const auto &offsets = graph->get_offset_vector();
745 diagonal_value_offsets_by_dim.clear();
746 landmark_local_offsets_by_dim.clear();
748 if (landmark_col_start >= offsets.size() - 1) {
753 l_workspace.resize(landmark_dim);
754 diagonal_workspace.resize(landmark_dim);
756 for (
const auto &entry : diagonal_coords_by_dim) {
757 const size_t dim = entry.first;
758 const auto &coords = entry.second;
760 h_diagonal_offsets.resize(coords.size());
761 h_local_offsets.resize(coords.size());
763 for (
size_t i = 0; i < coords.size(); i++) {
764 const auto &coord = coords[i];
765 h_diagonal_offsets[i] = diagonal_indices.at(coord);
766 h_local_offsets[i] = offsets[coord.col] - pose_dim;
769 diagonal_value_offsets_by_dim[dim] = h_diagonal_offsets;
770 landmark_local_offsets_by_dim[dim] = h_local_offsets;
774 void execute_diagonal_inverse_multiply(Graph<T, S> *graph,
777 auto stream = streams.select(0);
779 constexpr size_t threads_per_block = 256;
780 const P *diag_values_ptr = diagonal_values.data().get();
781 T *kernel_out = vec_out;
782 if (vec_out == vec_in) {
783 kernel_out = diagonal_workspace.data().get();
786 for (
const auto &entry : diagonal_value_offsets_by_dim) {
787 const size_t dim = entry.first;
788 const auto &a_offsets_dev = entry.second;
789 const auto vec_it = landmark_local_offsets_by_dim.find(dim);
790 if (vec_it == landmark_local_offsets_by_dim.end()) {
793 const auto &vec_offsets_dev = vec_it->second;
794 const size_t num_blocks = a_offsets_dev.size();
795 if (num_blocks == 0) {
799 const size_t total_rows = num_blocks * dim;
800 const size_t blocks =
801 (total_rows + threads_per_block - 1) / threads_per_block;
803 ops::block_matvec_assign_batched_kernel<T, P, T>
804 <<<blocks, threads_per_block, 0, stream>>>(
805 diag_values_ptr, a_offsets_dev.data().get(), vec_in, kernel_out,
806 vec_offsets_dev.data().get(), num_blocks, dim);
809 if (vec_out == vec_in) {
810 thrust::copy_n(thrust::cuda::par_nosync.on(stream), kernel_out,
811 landmark_dim, vec_out);
814 cudaStreamSynchronize(stream);
817 void setup_Hpl_vector_multiply(Graph<T, S> *graph,
StreamPool &streams) {
819 const auto &offsets = graph->get_offset_vector();
820 p_workspace.resize(pose_dim);
822 for (
auto &it : hpl_vec_ops) {
825 for (
auto &it : h_hpl_vec_ops) {
829 if (landmark_col_start >= num_block_columns) {
833 for (
size_t l = landmark_col_start; l < num_block_columns; l++) {
834 const size_t col_start = h_block_col_pointers[l];
835 const size_t col_end = h_block_col_pointers[l + 1];
836 const size_t dim_l = graph->get_variable_dimension(l);
837 const size_t x_offset = offsets[l] - pose_dim;
839 for (
size_t ka = col_start; ka < col_end; ka++) {
840 const size_t i = h_block_row_indices[ka];
841 if (i >= landmark_col_start) {
845 const size_t dim_i = graph->get_variable_dimension(i);
847 h_hpl_vec_ops[mvd].push_back(
852 for (
auto &entry : h_hpl_vec_ops) {
853 hpl_vec_ops[entry.first] = entry.second;
857 void execute_Hpl_vector_multiply(Graph<T, S> *graph,
StreamPool &streams,
858 T *vec_out, T *vec_in) {
859 auto stream = streams.select(0);
860 const auto &offsets = graph->get_offset_vector();
862 thrust::fill(thrust::cuda::par.on(stream), vec_out, vec_out + pose_dim,
865 const S *h_values = H.get_values_ptr();
867 constexpr size_t threads_per_block = 256;
868 for (
const auto &entry : hpl_vec_ops) {
869 const auto &mvd = entry.first;
870 const auto &ops_dev = entry.second;
871 const size_t num_ops = ops_dev.size();
873 const size_t total_rows = num_ops * mvd.row;
874 const size_t blocks =
875 (total_rows + threads_per_block - 1) / threads_per_block;
876 ops::block_matvec_add_batched_kernel<T, S>
877 <<<blocks, threads_per_block, 0, stream>>>(
878 h_values, ops_dev.data().get(), num_ops, vec_in, vec_out,
883 cudaStreamSynchronize(stream);
886 void setup_b_Schur_computation(Graph<T, S> *graph,
StreamPool &streams) {
888 const auto &offsets = graph->get_offset_vector();
889 if (landmark_col_start >= offsets.size()) {
896 b_Schur.resize(pose_dim);
897 p_workspace.resize(pose_dim);
898 l_workspace.resize(landmark_dim);
901 void execute_b_Schur_computation(Graph<T, S> *graph,
StreamPool &streams,
903 auto stream = streams.select(0);
904 const auto &offsets = graph->get_offset_vector();
906 T *b = graph->get_b().data().get();
908 thrust::copy_n(thrust::cuda::par.on(stream), b, pose_dim, b_Schur);
910 execute_diagonal_inverse_multiply(graph, streams, l_workspace.data().get(),
913 execute_Hpl_vector_multiply(graph, streams, p_workspace.data().get(),
914 l_workspace.data().get());
916 ops::axpy_async(stream, pose_dim, b_Schur,
static_cast<T
>(-1.0),
917 p_workspace.data().get(), b_Schur);
919 cudaStreamSynchronize(stream);
922 void setup_HplT_vector_multiply(Graph<T, S> *graph,
StreamPool &streams) {
924 const auto &offsets = graph->get_offset_vector();
925 for (
auto &it : hplt_vec_ops) {
928 for (
auto &it : h_hplt_vec_ops) {
932 if (landmark_col_start >= offsets.size() - 1) {
936 l_workspace.resize(landmark_dim);
938 if (landmark_col_start >= num_block_columns) {
942 for (
size_t l = landmark_col_start; l < num_block_columns; l++) {
943 const size_t col_start = h_block_col_pointers[l];
944 const size_t col_end = h_block_col_pointers[l + 1];
945 const size_t dim_l = graph->get_variable_dimension(l);
946 const size_t y_offset = offsets[l] - pose_dim;
948 for (
size_t ka = col_start; ka < col_end; ka++) {
949 const size_t i = h_block_row_indices[ka];
950 if (i >= landmark_col_start) {
954 const size_t dim_i = graph->get_variable_dimension(i);
956 h_hplt_vec_ops[mvd].push_back(
961 for (
auto &entry : h_hplt_vec_ops) {
962 hplt_vec_ops[entry.first] = entry.second;
966 void execute_HplT_vector_multiply(Graph<T, S> *graph,
StreamPool &streams,
967 T *vec_out, T *vec_in) {
968 auto stream = streams.select(0);
969 const auto &offsets = graph->get_offset_vector();
970 if (landmark_col_start >= num_block_columns) {
974 thrust::fill(thrust::cuda::par_nosync.on(stream), vec_out,
975 vec_out + landmark_dim,
static_cast<T
>(0));
977 const S *h_values = H.get_values_ptr();
979 constexpr size_t threads_per_block = 256;
980 for (
const auto &entry : hplt_vec_ops) {
981 const auto &mvd = entry.first;
982 const auto &ops_dev = entry.second;
983 const size_t num_ops = ops_dev.size();
988 const size_t total_cols = num_ops * mvd.col;
989 const size_t blocks =
990 (total_cols + threads_per_block - 1) / threads_per_block;
991 ops::block_matvec_transpose_add_batched_kernel<T, S>
992 <<<blocks, threads_per_block, 0, stream>>>(
993 h_values, ops_dev.data().get(), num_ops, vec_in, vec_out, mvd.row,
997 cudaStreamSynchronize(stream);
1001 void build_diagonal_structure(Graph<T, S> *graph,
StreamPool &streams) {
1002 diagonal_indices.clear();
1003 diagonal_coords_by_dim.clear();
1006 size_t diagonal_num_values = 0;
1007 for (
size_t block_col = landmark_col_start; block_col < num_block_columns;
1010 diagonal_indices.emplace(coord, diagonal_num_values);
1011 const size_t dim = graph->get_variable_dimension(block_col);
1012 diagonal_coords_by_dim[dim].push_back(coord);
1013 diagonal_num_values += dim * dim;
1016 diagonal_values.resize(diagonal_num_values);
1017 setup_batched_inverse_buffers(streams);
1020 void setup_batched_inverse_buffers(
StreamPool &streams) {
1021 auto stream = streams.select(0);
1022 P *diag_values = diagonal_values.data().get();
1023 const P *h_values = H.get_values_ptr();
1027 A_ptrs_device_by_dim.clear();
1028 Ainv_ptrs_device_by_dim.clear();
1029 info_by_dim.clear();
1031 for (
auto &entry : diagonal_coords_by_dim) {
1032 const size_t dim = entry.first;
1033 auto &coords = entry.second;
1034 const size_t num_blocks = coords.size();
1036 auto &A_ptrs_device = A_ptrs_device_by_dim[dim];
1037 auto &Ainv_ptrs_device = Ainv_ptrs_device_by_dim[dim];
1038 auto &info = info_by_dim[dim];
1040 A_ptrs.resize(num_blocks);
1041 Ainv_ptrs.resize(num_blocks);
1042 A_ptrs_device.resize(num_blocks);
1043 Ainv_ptrs_device.resize(num_blocks);
1044 info.resize(num_blocks);
1046 for (
size_t i = 0; i < num_blocks; i++) {
1047 const auto &coord = coords[i];
1048 const size_t src_offset = H.block_indices.at(coord);
1049 const size_t dst_offset = diagonal_indices.at(coord);
1050 this->A_ptrs[i] =
const_cast<P *
>(h_values + src_offset);
1051 this->Ainv_ptrs[i] = diag_values + dst_offset;
1054 cudaMemcpyAsync(A_ptrs_device.data().get(), this->A_ptrs.data(),
1055 sizeof(P *) * num_blocks, cudaMemcpyHostToDevice, stream);
1056 cudaMemcpyAsync(Ainv_ptrs_device.data().get(), this->Ainv_ptrs.data(),
1057 sizeof(P *) * num_blocks, cudaMemcpyHostToDevice, stream);
1059 cudaStreamSynchronize(stream);
1067 void execute_block_diagonal_inversion(Graph<T, S> *graph,
1070 auto stream = streams.select(0);
1071 cublasSetStream(handle, stream);
1074 constexpr size_t max_supported_dim = 32;
1076 for (
auto &entry : diagonal_coords_by_dim) {
1077 const size_t block_dim = entry.first;
1078 if (block_dim > max_supported_dim) {
1079 std::cerr <<
"Runtime error: Schur diagonal inversion received block "
1082 <<
", but cublas matinvBatched supports at "
1084 << max_supported_dim <<
"." << std::endl;
1088 auto &A_ptrs_device = A_ptrs_device_by_dim[block_dim];
1089 auto &Ainv_ptrs_device = Ainv_ptrs_device_by_dim[block_dim];
1090 auto &info = info_by_dim[block_dim];
1091 const int num_blocks =
static_cast<int>(entry.second.size());
1100 if constexpr (std::is_same<P, double>::value) {
1101 cublasDmatinvBatched(
1102 handle,
static_cast<int>(block_dim), A_ptrs_device.data().get(),
1103 static_cast<int>(block_dim), Ainv_ptrs_device.data().get(),
1104 static_cast<int>(block_dim), info.data().get(), num_blocks);
1105 }
else if constexpr (std::is_same<P, float>::value) {
1106 cublasSmatinvBatched(
1107 handle,
static_cast<int>(block_dim), A_ptrs_device.data().get(),
1108 static_cast<int>(block_dim), Ainv_ptrs_device.data().get(),
1109 static_cast<int>(block_dim), info.data().get(), num_blocks);
1113 cudaStreamSynchronize(stream);