8template <
typename T,
typename S,
size_t I,
size_t N,
typename M,
size_t E,
9 typename F,
typename VT, std::size_t... Is>
10__global__
void compute_jacobian_kernel(
11 const M *obs, T *error, S *jacs,
12 const typename F::ConstraintDataType *constraint_data,
13 const size_t *active_ids,
const size_t *ids,
const size_t num_threads,
14 const VT args,
const uint8_t *active_state, std::index_sequence<Is...>) {
15 const size_t idx = get_thread_id();
17 if (idx >= num_threads) {
21 constexpr auto vertex_sizes = F::get_vertex_sizes();
22 const auto factor_id = active_ids[idx];
23 const size_t vertex_id = ids[factor_id * N + I];
24 if (!is_vertex_active(active_state, vertex_id)) {
28 const M *local_obs = obs + factor_id;
30 const typename F::ConstraintDataType *local_data =
31 constraint_data + factor_id;
33 auto vargs = cuda::std::make_tuple(
34 (*(std::get<Is>(args) + ids[factor_id * N + Is]))...);
36 constexpr size_t jacobian_size = E * vertex_sizes[I];
38 S *j = jacs + factor_id * jacobian_size;
39 using DataType =
typename F::ConstraintDataType;
40 using ObsType =
typename F::ObservationType;
43 if constexpr (!std::is_same<T, S>::value) {
44 T jacobian[jacobian_size];
46 if constexpr (std::is_same<Empty, ObsType>::value &&
47 std::is_same<Empty, DataType>::value) {
48 F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
50 }
else if constexpr (std::is_same<Empty, DataType>::value) {
51 F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
52 *local_obs, jacobian);
53 }
else if constexpr (std::is_same<Empty, ObsType>::value) {
54 F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
55 *local_data, jacobian);
57 F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
58 *local_obs, *local_data, jacobian);
62 for (
size_t i = 0; i < jacobian_size; ++i) {
66 if constexpr (std::is_same<Empty, ObsType>::value &&
67 std::is_same<Empty, DataType>::value) {
68 F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))..., j);
69 }
else if constexpr (std::is_same<Empty, DataType>::value) {
70 F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
72 }
else if constexpr (std::is_same<Empty, ObsType>::value) {
73 F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
76 F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
77 *local_obs, *local_data, j);
82template <
typename T,
typename S,
typename F,
typename VT, std::size_t... Is>
83void launch_kernel_jacobians(
84 F *f, std::array<
const size_t *, F::get_num_vertices()> &hessian_ids,
85 VT &verts, std::array<S *, F::get_num_vertices()> &jacs,
86 const size_t num_factors, StreamPool &streams, std::index_sequence<Is...>) {
88 constexpr auto num_vertices = F::get_num_vertices();
89 const auto num_threads = num_factors;
90 size_t threads_per_block = 256;
92 (num_threads + threads_per_block - 1) / threads_per_block;
94 compute_jacobian_kernel<T, S, Is, num_vertices,
95 typename F::ObservationType, F::error_dim, F,
96 typename F::VertexPointerPointerTuple>
97 <<<num_blocks, threads_per_block, 0, streams.select(Is)>>>(
98 f->device_obs.data().get(), f->residuals.data().get(), jacs[Is],
99 f->data.data().get(), f->active_indices.data().get(),
100 f->device_ids.data().get(), num_threads, verts,
101 f->vertex_descriptors[Is]->get_active_state(),
102 std::make_index_sequence<num_vertices>{});
107template <
typename T,
typename S,
typename F>
108void compute_jacobians(F *f, StreamPool &streams) {
109 if (!(f->store_jacobians() || !is_analytical<F>())) {
113 constexpr auto num_vertices = F::get_num_vertices();
114 constexpr auto vertex_sizes = F::get_vertex_sizes();
118 auto verts = f->get_vertices();
119 std::array<S *, num_vertices> jacs;
120 std::array<const size_t *, num_vertices> hessian_ids;
121 for (
int i = 0; i < num_vertices; i++) {
123 jacs[i] = f->jacobians[i].data.data().get();
124 hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
127 thrust::fill(thrust::cuda::par_nosync.on(streams.select(i)),
128 f->jacobians[i].data.begin(), f->jacobians[i].data.end(),
132 const auto num_factors = f->active_count();
134 launch_kernel_jacobians<T, S>(f, hessian_ids, verts, jacs, num_factors,
136 std::make_index_sequence<num_vertices>{});
137 streams.sync_n(num_vertices);
140template <
typename highp,
typename T,
size_t I,
size_t N,
size_t E,
size_t D>
142scale_jacobians_kernel(T *jacs,
const highp *jacobian_scales,
143 const size_t *active_ids,
const size_t *ids,
144 const size_t *hessian_ids,
const uint8_t *active_state,
145 const size_t num_threads) {
147 const size_t idx = get_thread_id();
149 if (idx >= num_threads) {
153 constexpr size_t jacobian_size = D * E;
156 const size_t factor_id = active_ids[idx / D];
157 const size_t col = idx % D;
161 const size_t local_id =
164 if (!is_vertex_active(active_state, local_id)) {
168 const auto jacobian_offset = factor_id * jacobian_size;
169 const size_t hessian_offset = hessian_ids[local_id];
170 const highp scale = jacobian_scales[hessian_offset + col];
172 T *Jcol = jacs + jacobian_offset + col * E;
174 for (
size_t i = 0; i < E; i++) {
176 const highp scaled_j =
static_cast<highp
>(Jcol[i]) * scale;
178 Jcol[i] =
static_cast<T
>(scaled_j);
182template <
typename T,
typename S,
typename F, std::size_t... Is>
183void launch_kernel_scale_jacobians(
184 F *f, T *jacobian_scales,
185 std::array<
const size_t *, F::get_num_vertices()> &hessian_ids,
186 std::array<S *, F::get_num_vertices()> &jacs,
const size_t num_factors,
187 std::index_sequence<Is...>) {
189 constexpr size_t num_vertices = F::get_num_vertices();
190 constexpr size_t dimension = F::get_vertex_sizes()[Is];
191 const size_t num_threads = num_factors * dimension;
193 size_t threads_per_block = 256;
195 (num_threads + threads_per_block - 1) / threads_per_block;
197 scale_jacobians_kernel<T, S, Is, num_vertices, F::error_dim, dimension>
198 <<<num_blocks, threads_per_block>>>(
199 jacs[Is], jacobian_scales, f->active_indices.data().get(),
200 f->device_ids.data().get(), hessian_ids[Is],
201 f->vertex_descriptors[Is]->get_active_state(), num_threads);
206template <
typename T,
typename S,
typename F>
207void scale_jacobians(F *f, T *jacobian_scales) {
209 if (!(f->store_jacobians() || !is_analytical<F>())) {
213 constexpr auto num_vertices = F::get_num_vertices();
214 constexpr auto vertex_sizes = F::get_vertex_sizes();
218 auto verts = f->get_vertices();
219 std::array<S *, num_vertices> jacs;
220 std::array<const size_t *, num_vertices> hessian_ids;
221 for (
int i = 0; i < num_vertices; i++) {
222 jacs[i] = f->jacobians[i].data.data().get();
223 hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
226 const auto num_factors = f->active_count();
228 launch_kernel_scale_jacobians<T, S>(f, jacobian_scales, hessian_ids, jacs,
230 std::make_index_sequence<num_vertices>{});
238template <
typename T,
typename S,
size_t I,
size_t N,
size_t E,
typename F,
240__global__
void compute_b_kernel(T *b,
const T *error,
const size_t *active_ids,
241 const size_t *ids,
const size_t *hessian_ids,
242 const size_t num_threads,
const S *jacs,
243 const uint8_t *active_state,
const S *pmat,
244 const S *loss_derivative,
245 std::index_sequence<Is...>) {
246 const size_t idx = get_thread_id();
248 if (idx >= num_threads) {
252 constexpr auto vertex_sizes = F::get_vertex_sizes();
253 constexpr auto jacobian_size = vertex_sizes[I] * E;
257 const size_t factor_id = active_ids[idx / vertex_sizes[I]];
258 const size_t local_id =
261 if (!is_vertex_active(active_state, local_id)) {
264 const auto jacobian_offset = factor_id * jacobian_size;
265 const auto error_offset = factor_id * E;
267 const auto col_offset = (idx % vertex_sizes[I]) * E;
270 const T dL = loss_derivative[factor_id];
273 constexpr auto precision_matrix_size = E * E;
274 const auto precision_offset = factor_id * precision_matrix_size;
278 for (
int i = 0; i < E; i++) {
280 for (
int j = 0; j < E; j++) {
283 x2[i] += dL * (T)pmat[precision_offset + i * E + j] *
285 error[error_offset + j]);
291 for (
int i = 0; i < E; i++) {
292 value -= (T)jacs[jacobian_offset + col_offset + i] * x2[i];
295 const auto hessian_offset =
296 hessian_ids[local_id];
302 atomicAdd(&b[hessian_offset + (idx % vertex_sizes[I])], value);
305template <
typename T,
typename S,
size_t I,
size_t N,
typename M,
size_t E,
306 typename F,
typename VT, std::size_t... Is>
308compute_b_dynamic_kernel(T *b,
const T *error,
const size_t *active_ids,
309 const size_t *ids,
const size_t *hessian_ids,
310 const size_t num_threads,
const VT args,
const M *obs,
311 const T *jacobian_scales,
312 const typename F::ConstraintDataType *constraint_data,
313 const uint8_t *active_state,
const S *pmat,
314 const S *loss_derivative, std::index_sequence<Is...>) {
315 const size_t idx = get_thread_id();
317 if (idx >= num_threads) {
321 constexpr auto vertex_sizes = F::get_vertex_sizes();
322 constexpr auto jacobian_size = vertex_sizes[I] * E;
326 const size_t factor_id = active_ids[idx / vertex_sizes[I]];
327 const size_t local_id =
330 if (!is_vertex_active(active_state, local_id)) {
333 const auto jacobian_offset = factor_id * jacobian_size;
334 const auto error_offset = factor_id * E;
336 const auto col_offset = (idx % vertex_sizes[I]) * E;
339 const T dL = loss_derivative[factor_id];
342 constexpr auto precision_matrix_size = E * E;
343 const auto precision_offset = factor_id * precision_matrix_size;
347 for (
int i = 0; i < E; i++) {
349 for (
int j = 0; j < E; j++) {
352 x2[i] += dL * (T)pmat[precision_offset + i * E + j] *
354 error[error_offset + j]);
360 T jacobian[jacobian_size];
362 compute_Jblock<T, I, N, M, E, F, VT>(jacobian, factor_id, local_id, obs,
363 constraint_data, ids, hessian_ids, args,
364 std::make_index_sequence<N>{});
365 const auto hessian_offset = hessian_ids[local_id];
366 const auto scale = jacobian_scales[hessian_offset + (idx % vertex_sizes[I])];
368 for (
int i = 0; i < E; i++) {
369 value -= jacobian[col_offset + i] * x2[i];
373 atomicAdd(&b[hessian_offset + (idx % vertex_sizes[I])], value);
376template <
typename T,
typename S,
typename F, std::size_t... Is>
377void launch_kernel_compute_b(
378 F *f, T *b, std::array<
const size_t *, F::get_num_vertices()> &hessian_ids,
379 std::array<S *, F::get_num_vertices()> &jacs,
const T *jacobian_scales,
380 const size_t num_factors, std::index_sequence<Is...>) {
382 constexpr auto num_vertices = F::get_num_vertices();
383 const auto num_threads = num_factors * F::get_vertex_sizes()[Is];
386 size_t threads_per_block = 256;
388 (num_threads + threads_per_block - 1) / threads_per_block;
395 if (f->store_jacobians() || !is_analytical<F>()) {
397 compute_b_kernel<T, S, Is, num_vertices, F::error_dim, F>
398 <<<num_blocks, threads_per_block>>>(
399 b, f->residuals.data().get(), f->active_indices.data().get(),
400 f->device_ids.data().get(), hessian_ids[Is], num_threads,
401 jacs[Is], f->vertex_descriptors[Is]->get_active_state(),
402 f->precision_matrices.data().get(),
403 f->chi2_derivative.data().get(),
404 std::make_index_sequence<num_vertices>{});
408 if constexpr (is_analytical<F>()) {
409 compute_b_dynamic_kernel<T, S, Is, num_vertices,
410 typename F::ObservationType, F::error_dim, F,
411 typename F::VertexPointerPointerTuple>
412 <<<num_blocks, threads_per_block>>>(
413 b, f->residuals.data().get(), f->active_indices.data().get(),
414 f->device_ids.data().get(), hessian_ids[Is], num_threads,
415 f->get_vertices(), f->device_obs.data().get(), jacobian_scales,
416 f->data.data().get(),
417 f->vertex_descriptors[Is]->get_active_state(),
418 f->precision_matrices.data().get(),
419 f->chi2_derivative.data().get(),
420 std::make_index_sequence<num_vertices>{});
427template <
typename T,
typename S,
typename F>
428void compute_b_async(F *f, T *b,
const T *jacobian_scales) {
429 constexpr auto num_vertices = F::get_num_vertices();
430 constexpr auto vertex_sizes = F::get_vertex_sizes();
433 auto verts = f->get_vertices();
434 std::array<S *, num_vertices> jacs;
435 std::array<const size_t *, num_vertices> hessian_ids;
436 for (
int i = 0; i < num_vertices; i++) {
437 jacs[i] = f->jacobians[i].data.data().get();
438 hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
441 constexpr auto error_dim = F::error_dim;
442 const auto num_factors = f->active_count();
445 launch_kernel_compute_b<T, S>(f, b, hessian_ids, jacs, jacobian_scales,
447 std::make_index_sequence<num_vertices>{});