Graphite
Loading...
Searching...
No Matches
linearize.hpp
Go to the documentation of this file.
1
2#pragma once
4
5namespace graphite {
6namespace ops {
7
// Computes the analytical Jacobian block of factor type F with respect to its
// I-th vertex, for every active factor.
//
// Launch layout: 1D grid, one thread per active factor; threads past
// `num_threads` exit immediately.
//
// Template parameters:
//   T  - compute precision used to evaluate the Jacobian
//   S  - storage precision of `jacs` (may differ from T for mixed precision)
//   I  - index of the vertex slot this kernel differentiates with respect to
//   N  - number of vertices involved in each factor
//   M  - observation type
//   E  - error (residual) dimension of the factor
//   F  - factor type providing Traits::jacobian and the aliases used below
//   VT - tuple of per-vertex pointer arrays
template <typename T, typename S, size_t I, size_t N, typename M, size_t E,
          typename F, typename VT, std::size_t... Is>
__global__ void compute_jacobian_kernel(
    const M *obs, T *error, S *jacs,
    const typename F::ConstraintDataType *constraint_data,
    const size_t *active_ids, const size_t *ids, const size_t num_threads,
    const VT args, const uint8_t *active_state, std::index_sequence<Is...>) {
  const size_t idx = get_thread_id();

  // Grid-tail guard: the grid rarely divides num_threads evenly.
  if (idx >= num_threads) {
    return;
  }

  constexpr auto vertex_sizes = F::get_vertex_sizes();
  // Map this thread to its factor, then to that factor's I-th vertex.
  const auto factor_id = active_ids[idx];
  const size_t vertex_id = ids[factor_id * N + I];
  // Skip factors whose I-th vertex is flagged inactive.
  if (!is_vertex_active(active_state, vertex_id)) {
    return;
  }

  const M *local_obs = obs + factor_id;
  // T *local_error = error + factor_id * E;
  const typename F::ConstraintDataType *local_data =
      constraint_data + factor_id;

  // Gather this factor's vertices from the per-vertex arrays into a tuple.
  auto vargs = cuda::std::make_tuple(
      (*(std::get<Is>(args) + ids[factor_id * N + Is]))...);

  // Size of one E x vertex_sizes[I] Jacobian block.
  constexpr size_t jacobian_size = E * vertex_sizes[I];

  // Destination of this factor's Jacobian block in global storage.
  S *j = jacs + factor_id * jacobian_size;
  using DataType = typename F::ConstraintDataType;
  using ObsType = typename F::ObservationType;

  // if constexpr (is_low_precision<S>::value) {
  // Mixed precision: when storage type S differs from compute type T, evaluate
  // into a local buffer in T and convert element-wise on store.
  if constexpr (!std::is_same<T, S>::value) {
    T jacobian[jacobian_size];

    // Dispatch on which optional arguments (observation, constraint data) the
    // factor's jacobian takes; `Empty` marks an absent argument.
    if constexpr (std::is_same<Empty, ObsType>::value &&
                  std::is_same<Empty, DataType>::value) {
      F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
                                         jacobian);
    } else if constexpr (std::is_same<Empty, DataType>::value) {
      F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
                                         *local_obs, jacobian);
    } else if constexpr (std::is_same<Empty, ObsType>::value) {
      F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
                                         *local_data, jacobian);
    } else {
      F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
                                         *local_obs, *local_data, jacobian);
    }

    // Convert T -> S while copying the block out to global memory.
#pragma unroll
    for (size_t i = 0; i < jacobian_size; ++i) {
      j[i] = jacobian[i];
    }
  } else {
    // Same precision: write the Jacobian directly into global storage.
    // Same Empty-based argument dispatch as above.
    if constexpr (std::is_same<Empty, ObsType>::value &&
                  std::is_same<Empty, DataType>::value) {
      F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))..., j);
    } else if constexpr (std::is_same<Empty, DataType>::value) {
      F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
                                         *local_obs, j);
    } else if constexpr (std::is_same<Empty, ObsType>::value) {
      F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
                                         *local_data, j);
    } else {
      F::Traits::template jacobian<T, I>((*cuda::std::get<Is>(vargs))...,
                                         *local_obs, *local_data, j);
    }
  }
}
81
// Launches one compute_jacobian_kernel instance per vertex slot of factor F
// (one launch per pack index Is), each on its own stream from the pool.
// One thread per active factor; does not synchronize.
template <typename T, typename S, typename F, typename VT, std::size_t... Is>
void launch_kernel_jacobians(
    F *f, std::array<const size_t *, F::get_num_vertices()> &hessian_ids,
    VT &verts, std::array<S *, F::get_num_vertices()> &jacs,
    const size_t num_factors, StreamPool &streams, std::index_sequence<Is...>) {
  constexpr auto num_vertices = F::get_num_vertices();
  constexpr size_t block_dim = 256;

  // Fold over the vertex indices: each Is gets its own kernel launch.
  (([&] {
    const size_t thread_count = num_factors;
    // Ceiling division so the tail factors are covered.
    const size_t grid_dim = (thread_count + block_dim - 1) / block_dim;

    compute_jacobian_kernel<T, S, Is, num_vertices,
                            typename F::ObservationType, F::error_dim, F,
                            typename F::VertexPointerPointerTuple>
        <<<grid_dim, block_dim, 0, streams.select(Is)>>>(
            f->device_obs.data().get(), f->residuals.data().get(), jacs[Is],
            f->data.data().get(), f->active_indices.data().get(),
            f->device_ids.data().get(), thread_count, verts,
            f->vertex_descriptors[Is]->get_active_state(),
            std::make_index_sequence<num_vertices>{});
  }()),
   ...);
}
106
// Computes and stores the Jacobian blocks of every active factor of F: one
// kernel launch per vertex slot, each on its own stream from the pool, then
// synchronizes all streams used before returning.
//
// No-op when the factor neither stores Jacobians nor is non-analytical
// (i.e. nothing downstream needs the stored blocks).
template <typename T, typename S, typename F>
void compute_jacobians(F *f, StreamPool &streams) {
  if (!(f->store_jacobians() || !is_analytical<F>())) {
    return;
  }
  constexpr auto num_vertices = F::get_num_vertices();

  // At this point all necessary data should be on the GPU.
  auto verts = f->get_vertices();
  std::array<S *, num_vertices> jacs;
  std::array<const size_t *, num_vertices> hessian_ids;
  for (size_t i = 0; i < num_vertices; i++) {
    jacs[i] = f->jacobians[i].data.data().get();
    hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();

    // Important: Must clear Jacobian storage — the kernel early-outs for
    // inactive vertices, so stale values would otherwise survive.
    thrust::fill(thrust::cuda::par_nosync.on(streams.select(i)),
                 f->jacobians[i].data.begin(), f->jacobians[i].data.end(),
                 static_cast<S>(0));
  }

  const auto num_factors = f->active_count();

  launch_kernel_jacobians<T, S>(f, hessian_ids, verts, jacs, num_factors,
                                streams,
                                std::make_index_sequence<num_vertices>{});
  // The fills and kernels were issued on streams [0, num_vertices).
  streams.sync_n(num_vertices);
}
139
// Applies a per-column scale to every stored Jacobian block of the I-th vertex
// slot: column `col` of factor `factor_id` is multiplied by
// jacobian_scales[hessian_ids[vertex] + col]. The multiply is done in `highp`
// precision and the result is stored back in T.
//
// Launch layout: 1D grid, D threads per active factor (one per Jacobian
// column).
//
// Template parameters:
//   highp - high-precision type used for the scaling arithmetic
//   T     - storage precision of the Jacobian blocks
//   I     - vertex slot index within the factor
//   N     - number of vertices per factor
//   E     - error (residual) dimension
//   D     - dimension of the I-th vertex (columns per block)
template <typename highp, typename T, size_t I, size_t N, size_t E, size_t D>
__global__ void
scale_jacobians_kernel(T *jacs, const highp *jacobian_scales,
                       const size_t *active_ids, const size_t *ids,
                       const size_t *hessian_ids, const uint8_t *active_state,
                       const size_t num_threads) {

  const size_t idx = get_thread_id();

  // Grid-tail guard.
  if (idx >= num_threads) {
    return;
  }

  constexpr size_t jacobian_size = D * E;

  // This time, each factor will have D threads
  const size_t factor_id = active_ids[idx / D];
  const size_t col = idx % D;

  // Stored as E x d col major, but we need to transpose it to d x E, where d is
  // the vertex size
  const size_t local_id =
      ids[factor_id * N +
          I]; // N is the number of vertices involved in the factor
  if (!is_vertex_active(active_state, local_id)) {
    return;
  }

  const auto jacobian_offset = factor_id * jacobian_size;
  // hessian_ids maps the vertex to its base offset in the scale vector.
  const size_t hessian_offset = hessian_ids[local_id];
  const highp scale = jacobian_scales[hessian_offset + col];

  // Column `col` of this factor's E x D column-major block.
  T *Jcol = jacs + jacobian_offset + col * E;
#pragma unroll
  for (size_t i = 0; i < E; i++) {

    // Upcast, scale, and store back at storage precision.
    const highp scaled_j = static_cast<highp>(Jcol[i]) * scale;

    Jcol[i] = static_cast<T>(scaled_j);
  }
}
181
// For each vertex slot Is of factor F, launches scale_jacobians_kernel with
// one thread per (active factor, column) pair. Uses the default stream and
// does not synchronize.
template <typename T, typename S, typename F, std::size_t... Is>
void launch_kernel_scale_jacobians(
    F *f, T *jacobian_scales,
    std::array<const size_t *, F::get_num_vertices()> &hessian_ids,
    std::array<S *, F::get_num_vertices()> &jacs, const size_t num_factors,
    std::index_sequence<Is...>) {
  constexpr size_t num_vertices = F::get_num_vertices();
  constexpr size_t block_dim = 256;

  // One launch per vertex slot via fold expression.
  (([&] {
    constexpr size_t dimension = F::get_vertex_sizes()[Is];
    const size_t thread_count = num_factors * dimension;
    // Ceiling division so the tail threads are covered.
    const size_t grid_dim = (thread_count + block_dim - 1) / block_dim;

    scale_jacobians_kernel<T, S, Is, num_vertices, F::error_dim, dimension>
        <<<grid_dim, block_dim>>>(
            jacs[Is], jacobian_scales, f->active_indices.data().get(),
            f->device_ids.data().get(), hessian_ids[Is],
            f->vertex_descriptors[Is]->get_active_state(), thread_count);
  }()),
   ...);
}
205
// Rescales all stored Jacobian blocks of factor F column-wise by
// `jacobian_scales` (indexed through each vertex's hessian ids). No-op when
// the factor neither stores Jacobians nor is non-analytical. Launches on the
// default stream and does not synchronize.
template <typename T, typename S, typename F>
void scale_jacobians(F *f, T *jacobian_scales) {

  if (!(f->store_jacobians() || !is_analytical<F>())) {
    return;
  }
  constexpr auto num_vertices = F::get_num_vertices();

  // Gather per-vertex Jacobian storage and hessian-id tables.
  std::array<S *, num_vertices> jacs;
  std::array<const size_t *, num_vertices> hessian_ids;
  for (size_t i = 0; i < num_vertices; i++) {
    jacs[i] = f->jacobians[i].data.data().get();
    hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
  }

  const auto num_factors = f->active_count();

  launch_kernel_scale_jacobians<T, S>(f, jacobian_scales, hessian_ids, jacs,
                                      num_factors,
                                      std::make_index_sequence<num_vertices>{});
}
232
// The output will be part of b with length of the vertex (where b = -J^T * r)
// Note the negative sign - different papers use different conventions
// TODO: Replace with generic J^T x r kernel?
// Note: The error vector is local to the factor
// Include precision matrix
//
// Accumulates this factor's contribution -J_I^T * dL * P * r into the global
// gradient vector b, using stored Jacobian blocks (`jacs`).
//
// Launch layout: vertex_sizes[I] threads per active factor — each thread
// produces one entry of the vertex's contribution and atomicAdd's it into b
// (several factors may share a vertex, hence the atomic).
template <typename T, typename S, size_t I, size_t N, size_t E, typename F,
          std::size_t... Is>
__global__ void compute_b_kernel(T *b, const T *error, const size_t *active_ids,
                                 const size_t *ids, const size_t *hessian_ids,
                                 const size_t num_threads, const S *jacs,
                                 const uint8_t *active_state, const S *pmat,
                                 const S *loss_derivative,
                                 std::index_sequence<Is...>) {
  const size_t idx = get_thread_id();

  // Grid-tail guard.
  if (idx >= num_threads) {
    return;
  }

  constexpr auto vertex_sizes = F::get_vertex_sizes();
  constexpr auto jacobian_size = vertex_sizes[I] * E;

  // Stored as E x d col major, but we need to transpose it to d x E, where d is
  // the vertex size
  const size_t factor_id = active_ids[idx / vertex_sizes[I]];
  const size_t local_id =
      ids[factor_id * N +
          I]; // N is the number of vertices involved in the factor
  if (!is_vertex_active(active_state, local_id)) {
    return;
  }
  const auto jacobian_offset = factor_id * jacobian_size;
  const auto error_offset = factor_id * E;
  // constexpr auto col_offset = I*E; // for untransposed J
  const auto col_offset = (idx % vertex_sizes[I]) * E; // for untransposed J

  // Use loss kernel
  // Robust-loss derivative weight for this factor.
  const T dL = loss_derivative[factor_id];

  T value = 0.0;
  constexpr auto precision_matrix_size = E * E;
  const auto precision_offset = factor_id * precision_matrix_size;
  // x2 = dL * P * r for this factor (length-E intermediate).
  T x2[E] = {0.0};

#pragma unroll
  for (int i = 0; i < E; i++) { // pmat row
#pragma unroll
    for (int j = 0; j < E; j++) { // pmat col
      // x2[i] += pmat[precision_offset + i + j*E] * error[error_offset + j]; //
      // col major
      x2[i] += dL * (T)pmat[precision_offset + i * E + j] *
               static_cast<T>(
                   error[error_offset + j]); // row major (use for faster access
                                             // on symmetrical matrix)
    }
  }

  // value = -(J column for this thread) . x2  — one entry of -J^T * x2.
#pragma unroll
  for (int i = 0; i < E; i++) {
    value -= (T)jacs[jacobian_offset + col_offset + i] * x2[i];
  }

  const auto hessian_offset =
      hessian_ids[local_id]; // each vertex has a hessian_ids array

  // printf("Hessian offset: %u\n", hessian_offset);
  // printf("Adding b[%d] += %f\n", hessian_offset + (idx % vertex_sizes[I]),
  // value); printf("Thread %d, Hessian offset: %u\n", idx, hessian_offset);

  // Multiple factors may touch the same vertex entry of b concurrently.
  atomicAdd(&b[hessian_offset + (idx % vertex_sizes[I])], value);
}
304
// Variant of compute_b_kernel that recomputes the Jacobian block on the fly
// via compute_Jblock instead of reading stored blocks, then applies the
// per-column `jacobian_scales` factor to the accumulated entry. Used when
// Jacobians are not stored (see launch_kernel_compute_b).
//
// Launch layout: vertex_sizes[I] threads per active factor; each thread
// atomicAdd's one entry into b.
template <typename T, typename S, size_t I, size_t N, typename M, size_t E,
          typename F, typename VT, std::size_t... Is>
__global__ void
compute_b_dynamic_kernel(T *b, const T *error, const size_t *active_ids,
                         const size_t *ids, const size_t *hessian_ids,
                         const size_t num_threads, const VT args, const M *obs,
                         const T *jacobian_scales,
                         const typename F::ConstraintDataType *constraint_data,
                         const uint8_t *active_state, const S *pmat,
                         const S *loss_derivative, std::index_sequence<Is...>) {
  const size_t idx = get_thread_id();

  // Grid-tail guard.
  if (idx >= num_threads) {
    return;
  }

  constexpr auto vertex_sizes = F::get_vertex_sizes();
  constexpr auto jacobian_size = vertex_sizes[I] * E;

  // Stored as E x d col major, but we need to transpose it to d x E, where d is
  // the vertex size
  const size_t factor_id = active_ids[idx / vertex_sizes[I]];
  const size_t local_id =
      ids[factor_id * N +
          I]; // N is the number of vertices involved in the factor
  if (!is_vertex_active(active_state, local_id)) {
    return;
  }
  // NOTE(review): jacobian_offset is computed but never used in this variant
  // (the block is rebuilt locally below) — candidate for removal; confirm.
  const auto jacobian_offset = factor_id * jacobian_size;
  const auto error_offset = factor_id * E;
  // constexpr auto col_offset = I*E; // for untransposed J
  const auto col_offset = (idx % vertex_sizes[I]) * E; // for untransposed J

  // Use loss kernel
  // Robust-loss derivative weight for this factor.
  const T dL = loss_derivative[factor_id];

  T value = 0.0;
  constexpr auto precision_matrix_size = E * E;
  const auto precision_offset = factor_id * precision_matrix_size;
  // x2 = dL * P * r for this factor (length-E intermediate).
  T x2[E] = {0.0};

#pragma unroll
  for (int i = 0; i < E; i++) { // pmat row
#pragma unroll
    for (int j = 0; j < E; j++) { // pmat col
      // x2[i] += pmat[precision_offset + i + j*E] * error[error_offset + j]; //
      // col major
      x2[i] += dL * (T)pmat[precision_offset + i * E + j] *
               static_cast<T>(
                   error[error_offset + j]); // row major (use for faster access
                                             // on symmetrical matrix)
    }
  }

  // using G = std::conditional_t<is_low_precision<S>::value, T, S>;
  // Recompute this factor's Jacobian block locally instead of loading it.
  T jacobian[jacobian_size];

  compute_Jblock<T, I, N, M, E, F, VT>(jacobian, factor_id, local_id, obs,
                                       constraint_data, ids, hessian_ids, args,
                                       std::make_index_sequence<N>{});
  const auto hessian_offset = hessian_ids[local_id];
  // Per-column scale for the entry this thread produces.
  const auto scale = jacobian_scales[hessian_offset + (idx % vertex_sizes[I])];
#pragma unroll
  for (int i = 0; i < E; i++) {
    value -= jacobian[col_offset + i] * x2[i];
  }
  value *= scale;

  // Multiple factors may touch the same vertex entry of b concurrently.
  atomicAdd(&b[hessian_offset + (idx % vertex_sizes[I])], value);
}
375
// For each vertex slot Is of factor F, launches the appropriate b-accumulation
// kernel: compute_b_kernel when stored Jacobians are available (or the factor
// is non-analytical), otherwise compute_b_dynamic_kernel which rebuilds the
// Jacobian per thread and applies jacobian_scales. One thread per
// (active factor, vertex dimension) pair; default stream; no synchronization.
template <typename T, typename S, typename F, std::size_t... Is>
void launch_kernel_compute_b(
    F *f, T *b, std::array<const size_t *, F::get_num_vertices()> &hessian_ids,
    std::array<S *, F::get_num_vertices()> &jacs, const T *jacobian_scales,
    const size_t num_factors, std::index_sequence<Is...>) {
  (([&] {
    constexpr auto num_vertices = F::get_num_vertices();
    const auto num_threads = num_factors * F::get_vertex_sizes()[Is];
    // std::cout << "Launching compute b kernel" << std::endl;
    // std::cout << "Num threads: " << num_threads << std::endl;
    size_t threads_per_block = 256;
    // Ceiling division so the tail threads are covered.
    size_t num_blocks =
        (num_threads + threads_per_block - 1) / threads_per_block;

    // std::cout << "Checking obs ptr: " << f->device_obs.data().get() <<
    // std::endl; std::cout << "Checking residual ptr: " <<
    // f->residuals.data().get() << std::endl; std::cout << "Checking ids
    // ptr: " << f->device_ids.data().get() << std::endl;

    // Same predicate that gates Jacobian storage in compute_jacobians: when
    // true, stored blocks exist and the static kernel can be used.
    if (f->store_jacobians() || !is_analytical<F>()) {

      compute_b_kernel<T, S, Is, num_vertices, F::error_dim, F>
          <<<num_blocks, threads_per_block>>>(
              b, f->residuals.data().get(), f->active_indices.data().get(),
              f->device_ids.data().get(), hessian_ids[Is], num_threads,
              jacs[Is], f->vertex_descriptors[Is]->get_active_state(),
              f->precision_matrices.data().get(),
              f->chi2_derivative.data().get(),
              std::make_index_sequence<num_vertices>{});

    } else {
      // std::cout << "Launching compute b dynamic kernel" << std::endl;
      // Dynamic recomputation only exists for analytical factors; the
      // constexpr guard keeps the kernel out of non-analytical instantiations.
      if constexpr (is_analytical<F>()) {
        compute_b_dynamic_kernel<T, S, Is, num_vertices,
                                 typename F::ObservationType, F::error_dim, F,
                                 typename F::VertexPointerPointerTuple>
            <<<num_blocks, threads_per_block>>>(
                b, f->residuals.data().get(), f->active_indices.data().get(),
                f->device_ids.data().get(), hessian_ids[Is], num_threads,
                f->get_vertices(), f->device_obs.data().get(), jacobian_scales,
                f->data.data().get(),
                f->vertex_descriptors[Is]->get_active_state(),
                f->precision_matrices.data().get(),
                f->chi2_derivative.data().get(),
                std::make_index_sequence<num_vertices>{});
      }
    }
  }()),
  ...);
}
426
// Accumulates every active factor's gradient contribution (b = -J^T * dL * P * r)
// into `b`, dispatching per vertex slot via launch_kernel_compute_b.
// Asynchronous: launches on the default stream and returns without
// synchronizing — the caller is responsible for syncing before reading b.
template <typename T, typename S, typename F>
void compute_b_async(F *f, T *b, const T *jacobian_scales) {
  constexpr auto num_vertices = F::get_num_vertices();

  // Gather per-vertex Jacobian storage and hessian-id tables.
  std::array<S *, num_vertices> jacs;
  std::array<const size_t *, num_vertices> hessian_ids;
  for (size_t i = 0; i < num_vertices; i++) {
    jacs[i] = f->jacobians[i].data.data().get();
    hessian_ids[i] = f->vertex_descriptors[i]->get_hessian_ids();
  }

  const auto num_factors = f->active_count();
  launch_kernel_compute_b<T, S>(f, b, hessian_ids, jacs, jacobian_scales,
                                num_factors,
                                std::make_index_sequence<num_vertices>{});
}
450
451} // namespace ops
452} // namespace graphite