9template <
typename T,
typename S,
size_t N,
size_t E>
10__global__
void compute_hessian_block_kernel(
11 const size_t vi,
const size_t vj,
size_t dim_i,
size_t dim_j,
12 const size_t *active_factors,
const size_t num_active_factors,
13 const size_t *ids,
const size_t *block_offsets,
const uint8_t *vi_active,
14 const uint8_t *vj_active,
const size_t *hessian_offset_i,
15 const size_t *hessian_offset_j,
const S *jacobian_i,
const S *jacobian_j,
16 const S *precision,
const S *chi2_derivative, S *hessian) {
18 const auto idx = get_thread_id();
20 const auto block_id = idx / (dim_i * dim_j);
22 if (block_id >= num_active_factors) {
26 const auto factor_idx = active_factors[block_id];
28 const size_t vi_id = ids[factor_idx * N + vi];
29 const size_t vj_id = ids[factor_idx * N + vj];
31 if (is_vertex_active(vi_active, vi_id) &&
32 is_vertex_active(vj_active, vj_id)) {
34 const size_t block_size = dim_i * dim_j;
35 const size_t offset = idx % block_size;
39 const bool transposed = hessian_offset_i[vi_id] > hessian_offset_j[vj_id];
41 auto ji = factor_idx * E * dim_i + jacobian_i;
42 auto jj = factor_idx * E * dim_j + jacobian_j;
43 const auto precision_offset = factor_idx * E * E;
44 const auto p = precision + precision_offset;
47 cuda::std::swap(ji, jj);
48 cuda::std::swap(dim_i, dim_j);
51 const size_t row = offset % dim_i;
52 const size_t col = offset / dim_i;
60 const auto J = jj + col * E;
61 const auto Jt = ji + row * E;
63 for (
int i = 0; i < E; i++) {
66 for (
int j = 0; j < E; j++) {
67 pj += (highp)p[i * E + j] * (highp)J[j];
69 value += (highp)Jt[i] * pj;
72 value *= (highp)chi2_derivative[factor_idx];
74 auto block = hessian + (block_offsets[block_id] + (row + col * dim_i));
75 S lp_value =
static_cast<S
>(value);
76 atomicAdd(block, lp_value);
80template <
typename S,
int D>
81__global__
void augment_hessian_diagonal_kernel(S *diagonal_blocks,
82 S *scalar_diagonal,
const S mu,
83 const uint8_t *active_state,
84 const size_t num_threads) {
85 const size_t idx = get_thread_id();
87 if (idx >= num_threads) {
91 constexpr auto block_size = D * D;
93 const auto vertex_id = idx;
94 if (!is_vertex_active(active_state, vertex_id)) {
98 S *block = diagonal_blocks + vertex_id * block_size;
99 for (
size_t i = 0; i < D; i++) {
102 const double diag =
static_cast<double>(scalar_diagonal[vertex_id * D + i]);
103 const double new_diag =
104 diag +
static_cast<double>(mu) * std::clamp(diag, 1.0e-6, 1.0e32);
107 block[i * D + i] =
static_cast<S
>(new_diag);
111template <
typename T,
typename S,
typename V>
112void augment_block_diagonal(V *v, InvP<T, S> *block_diagonal,
113 InvP<T, S> *scalar_diagonal, T mu,
114 cudaStream_t stream) {
115 const size_t num_threads = v->count();
116 const auto threads_per_block = 256;
117 const auto num_blocks =
118 (num_threads + threads_per_block - 1) / threads_per_block;
120 augment_hessian_diagonal_kernel<InvP<T, S>, V::dim>
121 <<<num_blocks, threads_per_block, 0, stream>>>(
122 block_diagonal, scalar_diagonal, (InvP<T, S>)mu,
123 v->get_active_state(), num_threads);
126template <
typename T,
typename S,
int D>
127__global__
void apply_block_jacobi_kernel(T *z,
const T *r, S *block_diagonal,
128 const size_t *hessian_ids,
129 const uint8_t *active_state,
130 const size_t num_threads) {
131 const size_t idx = get_thread_id();
132 const auto local_vertex_id = idx / D;
134 if (idx >= num_threads || !is_vertex_active(active_state, local_vertex_id)) {
138 constexpr auto block_size = D * D;
140 S *block = block_diagonal + local_vertex_id * block_size;
141 const auto hessian_offset = hessian_ids[local_vertex_id];
142 const auto offset = idx % D;
143 const auto row = offset;
147 for (
size_t i = 0; i < D; i++) {
148 value += (T)block[row + i * D] * r[hessian_offset + i];
150 z[hessian_offset + row] = value;
153template <
typename T,
typename S,
typename V>
154void apply_block_jacobi(V *v, T *z,
const T *r, InvP<T, S> *block_diagonal,
155 cudaStream_t stream) {
156 const size_t num_parameters = v->count() * v->dimension();
157 const size_t num_threads = num_parameters;
158 const auto threads_per_block = 256;
159 const auto num_blocks =
160 (num_threads + threads_per_block - 1) / threads_per_block;
162 apply_block_jacobi_kernel<T, InvP<T, S>, V::dim>
163 <<<num_blocks, threads_per_block, 0, stream>>>(
164 z, r, block_diagonal, v->get_hessian_ids(), v->get_active_state(),
168template <
typename highp,
typename InvP,
typename T,
size_t I,
size_t N,
170__global__
void compute_hessian_diagonal_kernel(
171 InvP *diagonal_blocks,
const T *jacs,
const size_t *active_ids,
172 const size_t *ids,
const uint8_t *active_state,
const T *pmat,
173 const T *chi2_derivative,
const size_t num_threads) {
174 const size_t idx = get_thread_id();
176 if (idx >= num_threads) {
180 constexpr size_t jacobian_size = D * E;
181 constexpr size_t block_size = D * D;
185 const size_t factor_id = active_ids[idx / block_size];
186 const size_t local_id =
189 if (!is_vertex_active(active_state, local_id)) {
192 const auto jacobian_offset = factor_id * jacobian_size;
194 constexpr auto precision_matrix_size = E * E;
195 const auto precision_offset = factor_id * precision_matrix_size;
201 const size_t offset = idx % block_size;
202 const size_t row = offset % D;
203 const size_t col = offset / D;
207 const T *Jt = jacs + jacobian_offset + row * E;
208 const T *J = jacs + jacobian_offset + col * E;
210 const T *precision_matrix = pmat + precision_offset;
214 for (
int i = 0; i < E; i++) {
217 for (
int j = 0; j < E; j++) {
218 pj += (highp)precision_matrix[i * E + j] * (highp)J[j];
220 value += (highp)Jt[i] * pj;
233 value *= (highp)chi2_derivative[factor_id];
248 InvP *block = diagonal_blocks + (local_id * block_size + row + col * D);
263 InvP lp_value =
static_cast<InvP
>(value);
264 atomicAdd(block, lp_value);
269template <
typename highp,
typename InvP,
typename T,
size_t I,
size_t N,
270 size_t E,
size_t D,
typename VT,
typename F>
271__global__
void compute_hessian_diagonal_dynamic_kernel(
272 InvP *diagonal_blocks,
const size_t *active_ids,
const size_t *ids,
273 const size_t *hessian_ids,
const VT args,
274 const typename F::ObservationType *obs,
const highp *jacobian_scales,
275 const typename F::ConstraintDataType *constraint_data,
276 const uint8_t *active_state,
const T *pmat,
const T *chi2_derivative,
277 const size_t num_threads) {
278 const size_t idx = get_thread_id();
280 if (idx >= num_threads) {
284 constexpr size_t jacobian_size = D * E;
285 constexpr size_t block_size = D * D;
289 const size_t factor_id = active_ids[idx / block_size];
290 const size_t local_id =
293 if (!is_vertex_active(active_state, local_id)) {
297 constexpr auto precision_matrix_size = E * E;
298 const auto precision_offset = factor_id * precision_matrix_size;
302 const size_t offset = idx % block_size;
303 const size_t row = offset % D;
304 const size_t col = offset / D;
306 highp jacobian[jacobian_size];
308 compute_Jblock<highp, I, N, typename F::ObservationType, E, F, VT>(
309 jacobian, factor_id, local_id, obs, constraint_data, ids, hessian_ids,
310 args, std::make_index_sequence<N>{});
312 const auto hessian_offset = hessian_ids[local_id];
314 const highp *Jt = jacobian + row * E;
315 const highp *J = jacobian + col * E;
317 const T *precision_matrix = pmat + precision_offset;
319 const highp *jscale = jacobian_scales + hessian_offset;
323 for (
int i = 0; i < E; i++) {
326 for (
int j = 0; j < E; j++) {
327 pj += (highp)precision_matrix[i * E + j] * (highp)J[j] * jscale[col];
329 value += (highp)Jt[i] * jscale[row] * pj;
332 value *= (highp)chi2_derivative[factor_id];
334 InvP *block = diagonal_blocks + (local_id * block_size + row + col * D);
336 InvP lp_value =
static_cast<InvP
>(value);
337 atomicAdd(block, lp_value);
340template <
typename T,
typename S,
typename F, std::size_t... Is>
341void launch_kernel_block_diagonal(
342 F *f, std::array<InvP<T, S> *, F::get_num_vertices()> &diagonal_blocks,
343 std::array<
const size_t *, F::get_num_vertices()> &hessian_ids,
344 std::array<S *, F::get_num_vertices()> &jacs,
const T *jacobian_scales,
345 const size_t num_factors, cudaStream_t stream, std::index_sequence<Is...>) {
347 constexpr size_t num_vertices = F::get_num_vertices();
348 constexpr size_t dimension = F::get_vertex_sizes()[Is];
349 const size_t num_threads = num_factors * dimension * dimension;
354 size_t threads_per_block = 256;
356 (num_threads + threads_per_block - 1) / threads_per_block;
363 if (f->store_jacobians() || !is_analytical<F>()) {
364 compute_hessian_diagonal_kernel<T, InvP<T, S>, S, Is, num_vertices,
365 F::error_dim, dimension>
366 <<<num_blocks, threads_per_block, 0, stream>>>(
367 diagonal_blocks[Is], jacs[Is], f->active_indices.data().get(),
368 f->device_ids.data().get(),
369 f->vertex_descriptors[Is]->get_active_state(),
370 f->precision_matrices.data().get(),
371 f->chi2_derivative.data().get(), num_threads);
373 if constexpr (is_analytical<F>()) {
374 compute_hessian_diagonal_dynamic_kernel<
375 T, InvP<T, S>, S, Is, num_vertices, F::error_dim, dimension,
376 typename F::VertexPointerPointerTuple, F>
377 <<<num_blocks, threads_per_block, 0, stream>>>(
378 diagonal_blocks[Is], f->active_indices.data().get(),
379 f->device_ids.data().get(), hessian_ids[Is], f->get_vertices(),
380 f->device_obs.data().get(), jacobian_scales,
381 f->data.data().get(),
382 f->vertex_descriptors[Is]->get_active_state(),
383 f->precision_matrices.data().get(),
384 f->chi2_derivative.data().get(), num_threads);
391template <
typename T,
typename S,
typename F>
392void compute_block_diagonal(
393 F *f, std::array<InvP<T, S> *, F::get_num_vertices()> &diagonal_blocks,
394 const T *jacobian_scales, cudaStream_t stream) {
397 constexpr auto num_vertices = F::get_num_vertices();
398 constexpr auto vertex_sizes = F::get_vertex_sizes();
402 auto verts = f->get_vertices();
403 std::array<S *, num_vertices> jacs;
404 std::array<const size_t *, num_vertices> hessian_ids;
405 for (
int i = 0; i < num_vertices; i++) {
406 jacs[i] = f->jacobians[i].data.data().get();
407 hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
410 const auto num_factors = f->active_count();
412 launch_kernel_block_diagonal<T, S>(f, diagonal_blocks, hessian_ids, jacs,
413 jacobian_scales, num_factors, stream,
414 std::make_index_sequence<num_vertices>{});
417template <
typename highp,
typename T,
size_t I,
size_t N,
size_t E,
size_t D>
418__global__
void compute_hessian_scalar_diagonal_kernel(
419 highp *diagonal,
const T *jacs,
const size_t *active_ids,
const size_t *ids,
420 const size_t *hessian_ids,
const uint8_t *active_state,
const T *pmat,
421 const T *chi2_derivative,
const size_t num_threads) {
422 const size_t idx = get_thread_id();
424 if (idx >= num_threads) {
428 constexpr size_t jacobian_size = D * E;
432 const size_t factor_id = active_ids[idx / D];
433 const size_t local_id =
436 if (!is_vertex_active(active_state, local_id)) {
439 const auto jacobian_offset = factor_id * jacobian_size;
441 constexpr auto precision_matrix_size = E * E;
442 const auto precision_offset = factor_id * precision_matrix_size;
445 const size_t row = idx % D;
446 const size_t col = row;
450 const T *Jt = jacs + jacobian_offset + row * E;
451 const T *J = jacs + jacobian_offset + col * E;
453 const T *precision_matrix = pmat + precision_offset;
457 for (
int i = 0; i < E; i++) {
460 for (
int j = 0; j < E; j++) {
461 pj += (highp)precision_matrix[i * E + j] * (highp)J[j];
463 value += (highp)Jt[i] * pj;
466 value *= (highp)chi2_derivative[factor_id];
469 const size_t hessian_offset = hessian_ids[local_id];
470 highp *location = diagonal + hessian_offset + row;
472 atomicAdd(location, value);
475template <
typename highp,
typename T,
size_t I,
size_t N,
size_t E,
size_t D,
476 typename VT,
typename F,
bool use_scales>
477__global__
void compute_hessian_scalar_diagonal_dynamic_kernel(
478 highp *diagonal,
const T *jacs,
const size_t *active_ids,
const size_t *ids,
479 const size_t *hessian_ids,
const VT args,
480 const typename F::ObservationType *obs,
const highp *jacobian_scales,
481 const typename F::ConstraintDataType *constraint_data,
482 const uint8_t *active_state,
const T *pmat,
const T *chi2_derivative,
483 const size_t num_threads) {
484 const size_t idx = get_thread_id();
486 if (idx >= num_threads) {
490 constexpr size_t jacobian_size = D * E;
494 const size_t factor_id = active_ids[idx / D];
495 const size_t local_id =
498 if (!is_vertex_active(active_state, local_id)) {
502 constexpr auto precision_matrix_size = E * E;
503 const auto precision_offset = factor_id * precision_matrix_size;
506 const size_t row = idx % D;
507 const size_t col = row;
511 highp jacobian[jacobian_size];
513 compute_Jblock<highp, I, N, typename F::ObservationType, E, F, VT>(
514 jacobian, factor_id, local_id, obs, constraint_data, ids, hessian_ids,
515 args, std::make_index_sequence<N>{});
516 const auto hessian_offset = hessian_ids[local_id];
517 const highp *Jt = jacobian + row * E;
518 const highp *J = jacobian + col * E;
520 const T *precision_matrix = pmat + precision_offset;
522 if constexpr (use_scales) {
523 const highp *jscale = jacobian_scales + hessian_offset;
526 for (
int i = 0; i < E; i++) {
529 for (
int j = 0; j < E; j++) {
530 pj += (highp)precision_matrix[i * E + j] * (highp)J[j] * jscale[col];
532 value += (highp)Jt[i] * jscale[row] * pj;
537 for (
int i = 0; i < E; i++) {
540 for (
int j = 0; j < E; j++) {
541 pj += (highp)precision_matrix[i * E + j] * (highp)J[j];
543 value += (highp)Jt[i] * pj;
547 value *= (highp)chi2_derivative[factor_id];
550 highp *location = diagonal + hessian_offset + row;
552 atomicAdd(location, value);
555template <
typename T,
typename S,
typename F, std::size_t... Is>
556void launch_kernel_hessian_scalar_diagonal(
558 std::array<
const size_t *, F::get_num_vertices()> &hessian_ids,
559 std::array<S *, F::get_num_vertices()> &jacs,
const T *jacobian_scales,
560 const size_t num_factors, std::index_sequence<Is...>) {
562 constexpr size_t num_vertices = F::get_num_vertices();
563 constexpr size_t dimension = F::get_vertex_sizes()[Is];
564 const size_t num_threads = num_factors * dimension;
566 size_t threads_per_block = 256;
568 (num_threads + threads_per_block - 1) / threads_per_block;
570 if (f->store_jacobians() || !is_analytical<F>()) {
571 compute_hessian_scalar_diagonal_kernel<T, S, Is, num_vertices,
572 F::error_dim, dimension>
573 <<<num_blocks, threads_per_block>>>(
574 diagonal, jacs[Is], f->active_indices.data().get(),
575 f->device_ids.data().get(), hessian_ids[Is],
576 f->vertex_descriptors[Is]->get_active_state(),
577 f->precision_matrices.data().get(),
578 f->chi2_derivative.data().get(), num_threads);
580 if constexpr (is_analytical<F>()) {
581 if (jacobian_scales ==
nullptr) {
582 compute_hessian_scalar_diagonal_dynamic_kernel<
583 T, S, Is, num_vertices, F::error_dim, dimension,
584 typename F::VertexPointerPointerTuple, F,
false>
585 <<<num_blocks, threads_per_block>>>(
586 diagonal, jacs[Is], f->active_indices.data().get(),
587 f->device_ids.data().get(), hessian_ids[Is],
588 f->get_vertices(), f->device_obs.data().get(),
nullptr,
589 f->data.data().get(),
590 f->vertex_descriptors[Is]->get_active_state(),
591 f->precision_matrices.data().get(),
592 f->chi2_derivative.data().get(), num_threads);
594 compute_hessian_scalar_diagonal_dynamic_kernel<
595 T, S, Is, num_vertices, F::error_dim, dimension,
596 typename F::VertexPointerPointerTuple, F,
true>
597 <<<num_blocks, threads_per_block>>>(
598 diagonal, jacs[Is], f->active_indices.data().get(),
599 f->device_ids.data().get(), hessian_ids[Is],
600 f->get_vertices(), f->device_obs.data().get(),
601 jacobian_scales, f->data.data().get(),
602 f->vertex_descriptors[Is]->get_active_state(),
603 f->precision_matrices.data().get(),
604 f->chi2_derivative.data().get(), num_threads);
612template <
typename T,
typename S,
typename F>
613void compute_hessian_scalar_diagonal(F *f, T *diagonal,
614 const T *jacobian_scales) {
617 constexpr auto num_vertices = F::get_num_vertices();
618 constexpr auto vertex_sizes = F::get_vertex_sizes();
622 auto verts = f->get_vertices();
623 std::array<S *, num_vertices> jacs;
624 std::array<const size_t *, num_vertices> hessian_ids;
625 for (
int i = 0; i < num_vertices; i++) {
626 jacs[i] = f->jacobians[i].data.data().get();
627 hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
630 const auto num_factors = f->active_count();
632 launch_kernel_hessian_scalar_diagonal<T, S>(
633 f, diagonal, hessian_ids, jacs, jacobian_scales, num_factors,
634 std::make_index_sequence<num_vertices>{});