9template <
typename T,
typename S,
size_t N,
size_t E>
10__global__
void compute_hessian_block_kernel(
11 const size_t vi,
const size_t vj,
size_t dim_i,
size_t dim_j,
12 const size_t *active_factors,
const size_t num_active_factors,
13 const size_t *ids,
const size_t *block_offsets,
const uint8_t *vi_active,
14 const uint8_t *vj_active,
const size_t *hessian_offset_i,
15 const size_t *hessian_offset_j,
const S *jacobian_i,
const S *jacobian_j,
16 const S *precision,
const S *chi2_derivative, S *hessian) {
18 const auto idx = get_thread_id();
20 const auto block_id = idx / (dim_i * dim_j);
22 if (block_id >= num_active_factors) {
26 const auto factor_idx = active_factors[block_id];
28 const size_t vi_id = ids[factor_idx * N + vi];
29 const size_t vj_id = ids[factor_idx * N + vj];
31 if (is_vertex_active(vi_active, vi_id) &&
32 is_vertex_active(vj_active, vj_id)) {
34 const size_t block_size = dim_i * dim_j;
35 const size_t offset = idx % block_size;
39 const bool transposed = hessian_offset_i[vi_id] > hessian_offset_j[vj_id];
41 auto ji = factor_idx * E * dim_i + jacobian_i;
42 auto jj = factor_idx * E * dim_j + jacobian_j;
43 const auto precision_offset = factor_idx * E * E;
44 const auto p = precision + precision_offset;
47 cuda::std::swap(ji, jj);
48 cuda::std::swap(dim_i, dim_j);
51 const size_t row = offset % dim_i;
52 const size_t col = offset / dim_i;
60 const auto J = jj + col * E;
61 const auto Jt = ji + row * E;
63 for (
int i = 0; i < E; i++) {
66 for (
int j = 0; j < E; j++) {
67 pj += (highp)p[i * E + j] * (highp)J[j];
69 value += (highp)Jt[i] * pj;
72 value *= (highp)chi2_derivative[factor_idx];
74 auto block = hessian + (block_offsets[block_id] + (row + col * dim_i));
75 S lp_value =
static_cast<S
>(value);
76 atomicAdd(block, lp_value);
80template <
typename S,
int D>
81__global__
void augment_hessian_diagonal_kernel(S *diagonal_blocks,
82 S *scalar_diagonal,
const S mu,
83 const bool use_identity,
84 const uint8_t *active_state,
85 const size_t num_threads) {
86 const size_t idx = get_thread_id();
88 if (idx >= num_threads) {
92 constexpr auto block_size = D * D;
94 const auto vertex_id = idx;
95 if (!is_vertex_active(active_state, vertex_id)) {
99 S *block = diagonal_blocks + vertex_id * block_size;
100 for (
size_t i = 0; i < D; i++) {
101 const double diag =
static_cast<double>(scalar_diagonal[vertex_id * D + i]);
102 double new_diag = diag;
104 new_diag +=
static_cast<double>(mu);
106 new_diag +=
static_cast<double>(mu) * std::clamp(diag, 1.0e-6, 1.0e32);
108 block[i * D + i] =
static_cast<S
>(new_diag);
112template <
typename T,
typename S,
typename V>
113void augment_block_diagonal(V *v, InvP<T, S> *block_diagonal,
114 InvP<T, S> *scalar_diagonal,
const T mu,
115 const bool use_identity, cudaStream_t stream) {
116 const size_t num_threads = v->count();
117 const auto threads_per_block = 256;
118 const auto num_blocks =
119 (num_threads + threads_per_block - 1) / threads_per_block;
121 augment_hessian_diagonal_kernel<InvP<T, S>, V::dim>
122 <<<num_blocks, threads_per_block, 0, stream>>>(
123 block_diagonal, scalar_diagonal, (InvP<T, S>)mu, use_identity,
124 v->get_active_state(), num_threads);
127template <
typename T,
typename S,
int D>
128__global__
void apply_block_jacobi_kernel(T *z,
const T *r, S *block_diagonal,
129 const size_t *hessian_ids,
130 const uint8_t *active_state,
131 const size_t num_threads) {
132 const size_t idx = get_thread_id();
133 const auto local_vertex_id = idx / D;
135 if (idx >= num_threads || !is_vertex_active(active_state, local_vertex_id)) {
139 constexpr auto block_size = D * D;
141 S *block = block_diagonal + local_vertex_id * block_size;
142 const auto hessian_offset = hessian_ids[local_vertex_id];
143 const auto offset = idx % D;
144 const auto row = offset;
148 for (
size_t i = 0; i < D; i++) {
149 value += (T)block[row + i * D] * r[hessian_offset + i];
151 z[hessian_offset + row] = value;
154template <
typename T,
typename S,
typename V>
155void apply_block_jacobi(V *v, T *z,
const T *r, InvP<T, S> *block_diagonal,
156 cudaStream_t stream) {
157 const size_t num_parameters = v->count() * v->dimension();
158 const size_t num_threads = num_parameters;
159 const auto threads_per_block = 256;
160 const auto num_blocks =
161 (num_threads + threads_per_block - 1) / threads_per_block;
163 apply_block_jacobi_kernel<T, InvP<T, S>, V::dim>
164 <<<num_blocks, threads_per_block, 0, stream>>>(
165 z, r, block_diagonal, v->get_hessian_ids(), v->get_active_state(),
169template <
typename highp,
typename InvP,
typename T,
size_t I,
size_t N,
171__global__
void compute_hessian_diagonal_kernel(
172 InvP *diagonal_blocks,
const T *jacs,
const size_t *active_ids,
173 const size_t *ids,
const uint8_t *active_state,
const T *pmat,
174 const T *chi2_derivative,
const size_t num_threads) {
175 const size_t idx = get_thread_id();
177 if (idx >= num_threads) {
181 constexpr size_t jacobian_size = D * E;
182 constexpr size_t block_size = D * D;
186 const size_t factor_id = active_ids[idx / block_size];
187 const size_t local_id =
190 if (!is_vertex_active(active_state, local_id)) {
193 const auto jacobian_offset = factor_id * jacobian_size;
195 constexpr auto precision_matrix_size = E * E;
196 const auto precision_offset = factor_id * precision_matrix_size;
202 const size_t offset = idx % block_size;
203 const size_t row = offset % D;
204 const size_t col = offset / D;
208 const T *Jt = jacs + jacobian_offset + row * E;
209 const T *J = jacs + jacobian_offset + col * E;
211 const T *precision_matrix = pmat + precision_offset;
215 for (
int i = 0; i < E; i++) {
218 for (
int j = 0; j < E; j++) {
219 pj += (highp)precision_matrix[i * E + j] * (highp)J[j];
221 value += (highp)Jt[i] * pj;
234 value *= (highp)chi2_derivative[factor_id];
249 InvP *block = diagonal_blocks + (local_id * block_size + row + col * D);
264 InvP lp_value =
static_cast<InvP
>(value);
265 atomicAdd(block, lp_value);
270template <
typename highp,
typename InvP,
typename T,
size_t I,
size_t N,
271 size_t E,
size_t D,
typename VT,
typename F>
272__global__
void compute_hessian_diagonal_dynamic_kernel(
273 InvP *diagonal_blocks,
const size_t *active_ids,
const size_t *ids,
274 const size_t *hessian_ids,
const VT args,
275 const typename F::ObservationType *obs,
const highp *jacobian_scales,
276 const typename F::ConstraintDataType *constraint_data,
277 const uint8_t *active_state,
const T *pmat,
const T *chi2_derivative,
278 const size_t num_threads) {
279 const size_t idx = get_thread_id();
281 if (idx >= num_threads) {
285 constexpr size_t jacobian_size = D * E;
286 constexpr size_t block_size = D * D;
290 const size_t factor_id = active_ids[idx / block_size];
291 const size_t local_id =
294 if (!is_vertex_active(active_state, local_id)) {
298 constexpr auto precision_matrix_size = E * E;
299 const auto precision_offset = factor_id * precision_matrix_size;
303 const size_t offset = idx % block_size;
304 const size_t row = offset % D;
305 const size_t col = offset / D;
307 highp jacobian[jacobian_size];
309 compute_Jblock<highp, I, N, typename F::ObservationType, E, F, VT>(
310 jacobian, factor_id, local_id, obs, constraint_data, ids, hessian_ids,
311 args, std::make_index_sequence<N>{});
313 const auto hessian_offset = hessian_ids[local_id];
315 const highp *Jt = jacobian + row * E;
316 const highp *J = jacobian + col * E;
318 const T *precision_matrix = pmat + precision_offset;
320 const highp *jscale = jacobian_scales + hessian_offset;
324 for (
int i = 0; i < E; i++) {
327 for (
int j = 0; j < E; j++) {
328 pj += (highp)precision_matrix[i * E + j] * (highp)J[j] * jscale[col];
330 value += (highp)Jt[i] * jscale[row] * pj;
333 value *= (highp)chi2_derivative[factor_id];
335 InvP *block = diagonal_blocks + (local_id * block_size + row + col * D);
337 InvP lp_value =
static_cast<InvP
>(value);
338 atomicAdd(block, lp_value);
341template <
typename T,
typename S,
typename F, std::size_t... Is>
342void launch_kernel_block_diagonal(
343 F *f, std::array<InvP<T, S> *, F::get_num_vertices()> &diagonal_blocks,
344 std::array<
const size_t *, F::get_num_vertices()> &hessian_ids,
345 std::array<S *, F::get_num_vertices()> &jacs,
const T *jacobian_scales,
346 const size_t num_factors, cudaStream_t stream, std::index_sequence<Is...>) {
348 constexpr size_t num_vertices = F::get_num_vertices();
349 constexpr size_t dimension = F::get_vertex_sizes()[Is];
350 const size_t num_threads = num_factors * dimension * dimension;
355 size_t threads_per_block = 256;
357 (num_threads + threads_per_block - 1) / threads_per_block;
364 if (f->store_jacobians() || !is_analytical<F>()) {
365 compute_hessian_diagonal_kernel<T, InvP<T, S>, S, Is, num_vertices,
366 F::error_dim, dimension>
367 <<<num_blocks, threads_per_block, 0, stream>>>(
368 diagonal_blocks[Is], jacs[Is], f->active_indices.data().get(),
369 f->device_ids.data().get(),
370 f->vertex_descriptors[Is]->get_active_state(),
371 f->precision_matrices.data().get(),
372 f->chi2_derivative.data().get(), num_threads);
374 if constexpr (is_analytical<F>()) {
375 compute_hessian_diagonal_dynamic_kernel<
376 T, InvP<T, S>, S, Is, num_vertices, F::error_dim, dimension,
377 typename F::VertexPointerPointerTuple, F>
378 <<<num_blocks, threads_per_block, 0, stream>>>(
379 diagonal_blocks[Is], f->active_indices.data().get(),
380 f->device_ids.data().get(), hessian_ids[Is], f->get_vertices(),
381 f->device_obs.data().get(), jacobian_scales,
382 f->data.data().get(),
383 f->vertex_descriptors[Is]->get_active_state(),
384 f->precision_matrices.data().get(),
385 f->chi2_derivative.data().get(), num_threads);
392template <
typename T,
typename S,
typename F>
393void compute_block_diagonal(
394 F *f, std::array<InvP<T, S> *, F::get_num_vertices()> &diagonal_blocks,
395 const T *jacobian_scales, cudaStream_t stream) {
398 constexpr auto num_vertices = F::get_num_vertices();
399 constexpr auto vertex_sizes = F::get_vertex_sizes();
403 auto verts = f->get_vertices();
404 std::array<S *, num_vertices> jacs;
405 std::array<const size_t *, num_vertices> hessian_ids;
406 for (
int i = 0; i < num_vertices; i++) {
407 jacs[i] = f->jacobians[i].data.data().get();
408 hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
411 const auto num_factors = f->active_count();
413 launch_kernel_block_diagonal<T, S>(f, diagonal_blocks, hessian_ids, jacs,
414 jacobian_scales, num_factors, stream,
415 std::make_index_sequence<num_vertices>{});
418template <
typename highp,
typename T,
size_t I,
size_t N,
size_t E,
size_t D>
419__global__
void compute_hessian_scalar_diagonal_kernel(
420 highp *diagonal,
const T *jacs,
const size_t *active_ids,
const size_t *ids,
421 const size_t *hessian_ids,
const uint8_t *active_state,
const T *pmat,
422 const T *chi2_derivative,
const size_t num_threads) {
423 const size_t idx = get_thread_id();
425 if (idx >= num_threads) {
429 constexpr size_t jacobian_size = D * E;
433 const size_t factor_id = active_ids[idx / D];
434 const size_t local_id =
437 if (!is_vertex_active(active_state, local_id)) {
440 const auto jacobian_offset = factor_id * jacobian_size;
442 constexpr auto precision_matrix_size = E * E;
443 const auto precision_offset = factor_id * precision_matrix_size;
446 const size_t row = idx % D;
447 const size_t col = row;
451 const T *Jt = jacs + jacobian_offset + row * E;
452 const T *J = jacs + jacobian_offset + col * E;
454 const T *precision_matrix = pmat + precision_offset;
458 for (
int i = 0; i < E; i++) {
461 for (
int j = 0; j < E; j++) {
462 pj += (highp)precision_matrix[i * E + j] * (highp)J[j];
464 value += (highp)Jt[i] * pj;
467 value *= (highp)chi2_derivative[factor_id];
470 const size_t hessian_offset = hessian_ids[local_id];
471 highp *location = diagonal + hessian_offset + row;
473 atomicAdd(location, value);
476template <
typename highp,
typename T,
size_t I,
size_t N,
size_t E,
size_t D,
477 typename VT,
typename F,
bool use_scales>
478__global__
void compute_hessian_scalar_diagonal_dynamic_kernel(
479 highp *diagonal,
const T *jacs,
const size_t *active_ids,
const size_t *ids,
480 const size_t *hessian_ids,
const VT args,
481 const typename F::ObservationType *obs,
const highp *jacobian_scales,
482 const typename F::ConstraintDataType *constraint_data,
483 const uint8_t *active_state,
const T *pmat,
const T *chi2_derivative,
484 const size_t num_threads) {
485 const size_t idx = get_thread_id();
487 if (idx >= num_threads) {
491 constexpr size_t jacobian_size = D * E;
495 const size_t factor_id = active_ids[idx / D];
496 const size_t local_id =
499 if (!is_vertex_active(active_state, local_id)) {
503 constexpr auto precision_matrix_size = E * E;
504 const auto precision_offset = factor_id * precision_matrix_size;
507 const size_t row = idx % D;
508 const size_t col = row;
512 highp jacobian[jacobian_size];
514 compute_Jblock<highp, I, N, typename F::ObservationType, E, F, VT>(
515 jacobian, factor_id, local_id, obs, constraint_data, ids, hessian_ids,
516 args, std::make_index_sequence<N>{});
517 const auto hessian_offset = hessian_ids[local_id];
518 const highp *Jt = jacobian + row * E;
519 const highp *J = jacobian + col * E;
521 const T *precision_matrix = pmat + precision_offset;
523 if constexpr (use_scales) {
524 const highp *jscale = jacobian_scales + hessian_offset;
527 for (
int i = 0; i < E; i++) {
530 for (
int j = 0; j < E; j++) {
531 pj += (highp)precision_matrix[i * E + j] * (highp)J[j] * jscale[col];
533 value += (highp)Jt[i] * jscale[row] * pj;
538 for (
int i = 0; i < E; i++) {
541 for (
int j = 0; j < E; j++) {
542 pj += (highp)precision_matrix[i * E + j] * (highp)J[j];
544 value += (highp)Jt[i] * pj;
548 value *= (highp)chi2_derivative[factor_id];
551 highp *location = diagonal + hessian_offset + row;
553 atomicAdd(location, value);
556template <
typename T,
typename S,
typename F, std::size_t... Is>
557void launch_kernel_hessian_scalar_diagonal(
559 std::array<
const size_t *, F::get_num_vertices()> &hessian_ids,
560 std::array<S *, F::get_num_vertices()> &jacs,
const T *jacobian_scales,
561 const size_t num_factors, std::index_sequence<Is...>) {
563 constexpr size_t num_vertices = F::get_num_vertices();
564 constexpr size_t dimension = F::get_vertex_sizes()[Is];
565 const size_t num_threads = num_factors * dimension;
567 size_t threads_per_block = 256;
569 (num_threads + threads_per_block - 1) / threads_per_block;
571 if (f->store_jacobians() || !is_analytical<F>()) {
572 compute_hessian_scalar_diagonal_kernel<T, S, Is, num_vertices,
573 F::error_dim, dimension>
574 <<<num_blocks, threads_per_block>>>(
575 diagonal, jacs[Is], f->active_indices.data().get(),
576 f->device_ids.data().get(), hessian_ids[Is],
577 f->vertex_descriptors[Is]->get_active_state(),
578 f->precision_matrices.data().get(),
579 f->chi2_derivative.data().get(), num_threads);
581 if constexpr (is_analytical<F>()) {
582 if (jacobian_scales ==
nullptr) {
583 compute_hessian_scalar_diagonal_dynamic_kernel<
584 T, S, Is, num_vertices, F::error_dim, dimension,
585 typename F::VertexPointerPointerTuple, F,
false>
586 <<<num_blocks, threads_per_block>>>(
587 diagonal, jacs[Is], f->active_indices.data().get(),
588 f->device_ids.data().get(), hessian_ids[Is],
589 f->get_vertices(), f->device_obs.data().get(),
nullptr,
590 f->data.data().get(),
591 f->vertex_descriptors[Is]->get_active_state(),
592 f->precision_matrices.data().get(),
593 f->chi2_derivative.data().get(), num_threads);
595 compute_hessian_scalar_diagonal_dynamic_kernel<
596 T, S, Is, num_vertices, F::error_dim, dimension,
597 typename F::VertexPointerPointerTuple, F,
true>
598 <<<num_blocks, threads_per_block>>>(
599 diagonal, jacs[Is], f->active_indices.data().get(),
600 f->device_ids.data().get(), hessian_ids[Is],
601 f->get_vertices(), f->device_obs.data().get(),
602 jacobian_scales, f->data.data().get(),
603 f->vertex_descriptors[Is]->get_active_state(),
604 f->precision_matrices.data().get(),
605 f->chi2_derivative.data().get(), num_threads);
613template <
typename T,
typename S,
typename F>
614void compute_hessian_scalar_diagonal(F *f, T *diagonal,
615 const T *jacobian_scales) {
618 constexpr auto num_vertices = F::get_num_vertices();
619 constexpr auto vertex_sizes = F::get_vertex_sizes();
623 auto verts = f->get_vertices();
624 std::array<S *, num_vertices> jacs;
625 std::array<const size_t *, num_vertices> hessian_ids;
626 for (
int i = 0; i < num_vertices; i++) {
627 jacs[i] = f->jacobians[i].data.data().get();
628 hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
631 const auto num_factors = f->active_count();
633 launch_kernel_hessian_scalar_diagonal<T, S>(
634 f, diagonal, hessian_ids, jacs, jacobian_scales, num_factors,
635 std::make_index_sequence<num_vertices>{});
The top-level namespace for Graphite.
Definition eigen_solver.cpp:4