Graphite
Loading...
Searching...
No Matches
hessian.hpp
Go to the documentation of this file.
1
2#pragma once
3#include <graphite/block.hpp>
4#include <graphite/common.hpp>
5#include <graphite/stream.hpp>
6#include <graphite/utils.hpp>
7#include <thrust/sort.h>
8#include <thrust/unique.h>
9#include <unordered_map>
10#include <utility>
11
12namespace graphite {
13
14template <typename T, typename I> class CSCMatrix {
15public:
16 thrust::device_vector<I> d_pointers;
17 thrust::device_vector<I> d_indices;
18 thrust::device_vector<T> d_values;
19};
20
21template <typename T> class HessianBlocks {
22public:
23 std::pair<size_t, size_t> dimensions;
24 size_t num_blocks;
25 thrust::device_vector<T> data;
26 thrust::device_vector<BlockCoordinates> block_coordinates;
27
28 void resize(size_t num_blocks, size_t rows, size_t cols) {
29 dimensions = {rows, cols};
30 this->num_blocks = num_blocks;
31 data.resize(rows * cols * num_blocks);
32 block_coordinates.resize(num_blocks);
33 }
34
35 void fill(const T &value) {
36 thrust::fill(thrust::device, data.begin(), data.end(), value);
37 }
38
39 void zero() {
40 thrust::fill(thrust::device, data.begin(), data.end(), static_cast<T>(0));
41 }
42};
43
44template <typename T, typename S> class Hessian {
45public:
46 // Returns coordinates of upper triangular filled-in Hessian blocks
47 std::vector<BlockCoordinates> get_block_coordinates(Graph<T, S> *graph) {
48 // For a constraint to contribute to the Hessian,
49 // the constraint must be active, and both variables must be active
50 // (non-fixed), and the block must reside in the upper triangular part of
51 // the Hessian (i.e., row index <= column index)
52 thrust::device_vector<BlockCoordinates> block_coords;
53 auto &f_desc = graph->get_factor_descriptors();
54 for (auto &f : f_desc) {
55 f->get_hessian_block_coordinates(block_coords);
56 }
57
58 thrust::sort(
59 thrust::device, block_coords.begin(), block_coords.end(),
60 [] __device__(const BlockCoordinates &a, const BlockCoordinates &b) {
61 // sort so that we get blocks in column-major order (e.g consecutive
62 // indices in same column)
63 if (a.col == b.col) {
64 return a.row < b.row;
65 }
66 return a.col < b.col;
67 });
68
69 // Remove duplicates
70 auto end_it = thrust::unique(
71 thrust::device, block_coords.begin(), block_coords.end(),
72 [] __device__(const BlockCoordinates &a, const BlockCoordinates &b) {
73 return (a.row == b.row) && (a.col == b.col);
74 });
75
76 // Resize the vector to remove duplicates
77 block_coords.resize(end_it - block_coords.begin());
78
79 // Copy to host and convert to std::vector
80 std::vector<BlockCoordinates> host_block_coords(block_coords.size());
81 thrust::copy(block_coords.begin(), block_coords.end(),
82 host_block_coords.begin());
83 return host_block_coords;
84 }
85
86 // Block coords must be unique and in column-major order
87 // Build block-csc style indices on the host that we can access on GPU for
88 // quickly constructing a scalar CSC-style Hessian matrix (upper triangle
89 // only)
90
91 void build_indices(Graph<T, S> *graph,
92 const std::vector<BlockCoordinates> &block_coords) {
93 const auto num_columns = graph->get_num_block_columns();
94 thrust::host_vector<size_t> col_pointers(num_columns + 1);
95 thrust::host_vector<size_t> row_indices(block_coords.size());
96 thrust::host_vector<size_t> offsets(block_coords.size());
97 thrust::fill(thrust::host, col_pointers.begin(), col_pointers.end(), 0);
98
99 for (size_t i = 0; i < block_coords.size(); i++) {
100 const auto &coord = block_coords[i];
101
102 col_pointers[coord.col]++;
103 row_indices[i] = coord.row;
104 offsets[i] = block_indices.at(coord);
105 }
106
107 // Transfer to device
108 d_col_pointers = col_pointers;
109 d_row_indices = row_indices;
110 d_offsets = offsets;
111 d_hessian_offsets = graph->get_offset_vector();
112
113 thrust::exclusive_scan(thrust::device, d_col_pointers.begin(),
114 d_col_pointers.end(), d_col_pointers.begin());
115
116 // Also need a map of scalar column to block column for constructing CSR
117 // matrices
118 scalar_to_block_map.resize(graph->get_hessian_dimension());
119 auto s2b_map = scalar_to_block_map.data().get();
120 const auto bc_offset = d_hessian_offsets.data().get();
121 thrust::for_each(
122 thrust::device, thrust::make_counting_iterator<size_t>(0),
123 thrust::make_counting_iterator<size_t>(graph->get_num_block_columns()),
124 [=] __device__(size_t block_column) {
125 const size_t hessian_col = bc_offset[block_column];
126 const auto dim = bc_offset[block_column + 1] - hessian_col;
127
128 for (size_t i = 0; i < dim; i++) {
129 s2b_map[hessian_col + i] = block_column;
130 }
131 });
132 }
133
134 void backup_diagonal(Graph<T, S> *graph, StreamPool &streams) {
135 d_prev_diagonal.resize(graph->get_hessian_dimension());
136
137 auto diag = d_prev_diagonal.data().get();
138 const auto h_offsets = d_hessian_offsets.data().get();
139 const auto p_col = d_col_pointers.data().get();
140 const auto r_idx = d_row_indices.data().get();
141 const auto block_locations = d_offsets.data().get();
142 const auto h = d_hessian.data().get();
143
144 thrust::for_each(
145 thrust::device, thrust::make_counting_iterator<size_t>(0),
146 thrust::make_counting_iterator<size_t>(graph->get_num_block_columns()),
147 [=] __device__(size_t block_col) {
148 const size_t hessian_col = h_offsets[block_col];
149 const size_t dim = h_offsets[block_col + 1] - hessian_col;
150
151 // find diagonal block in column where row == col
152 // const auto start = p_col[block_col];
153 const auto end = p_col[block_col + 1];
154
155 const auto b = end - 1; // assume there is always a diagonal block and
156 // it is always last
157
158 if (r_idx[b] == block_col) {
159 // found diagonal block, copy elements
160 const auto block = h + block_locations[b];
161 for (size_t i = 0; i < dim; i++) {
162 diag[hessian_col + i] = block[i * dim + i];
163 }
164 }
165 });
166 }
167
168 void apply_damping(Graph<T, S> *graph, T damping_factor,
169 StreamPool &streams) {
170 // Assume diagonal was already backed up when recomputing Hessian values
171 auto diag = d_prev_diagonal.data().get();
172 const auto h_offsets = d_hessian_offsets.data().get();
173 const auto p_col = d_col_pointers.data().get();
174 const auto r_idx = d_row_indices.data().get();
175 auto block_locations = d_offsets.data().get();
176 auto h = d_hessian.data().get();
177
178 thrust::for_each(
179 thrust::device, thrust::make_counting_iterator<size_t>(0),
180 thrust::make_counting_iterator<size_t>(graph->get_num_block_columns()),
181 [=] __device__(size_t block_col) {
182 const size_t hessian_col = h_offsets[block_col];
183 const size_t dim = h_offsets[block_col + 1] - hessian_col;
184
185 // find diagonal block in column where row == col
186 // const auto start = p_col[block_col];
187 const auto end = p_col[block_col + 1];
188 const auto b = end - 1; // assume there is always a diagonal block and
189 // it is always last
190
191 if (r_idx[b] == block_col) {
192 // found diagonal block, backup then apply damping
193 auto block = h + block_locations[b];
194 for (size_t i = 0; i < dim; i++) {
195 const auto h_diag = diag[hessian_col + i];
196 block[i * dim + i] =
197 (S)((double)h_diag * (1.0 + (double)damping_factor));
198 }
199 }
200 });
201 }
202
203 void setup_hessian_computation(Graph<T, S> *graph, StreamPool &streams) {
204 // Build offset buffer for multiplying Jacobian blocks.
205 // Each offset corresponds to the start index of an output Hessian block.\
206
207 // Generate the offsets for each product, for each factor
208
209 // First we need to compute the upper bound for the size of h_block_offsets
210 const auto &factors = graph->get_factor_descriptors();
211 size_t mul_count = 0;
212 for (auto &f : factors) {
213 const size_t num_vertices = f->get_num_descriptors();
214 for (size_t i = 0; i < num_vertices; i++) {
215 for (size_t j = i; j < num_vertices; j++) {
216 mul_count += f->active_count();
217 }
218 }
219 }
220
221 h_block_offsets.resize(mul_count);
222
223 // Now compute the offsets
224 mul_count = 0;
225 for (auto &f : factors) {
226 mul_count += f->setup_hessian_computation(
227 block_indices, d_hessian, h_block_offsets.data() + mul_count,
228 streams);
229 }
230
231 // Final copy
232 d_block_offsets = h_block_offsets;
233 }
234
235 void execute_hessian_computation(Graph<T, S> *graph, StreamPool &streams) {
236 thrust::fill(thrust::device, d_hessian.begin(), d_hessian.end(),
237 static_cast<S>(0.0));
238 size_t mul_count = 0;
239 const auto &factors = graph->get_factor_descriptors();
240 for (auto &f : factors) {
241 mul_count += f->execute_hessian_computation(
242 block_indices, d_hessian, d_block_offsets.data().get() + mul_count,
243 streams);
244 }
245 }
246
247 // Data
248 std::unordered_map<BlockCoordinates, size_t> block_indices;
249 thrust::device_vector<S> d_hessian;
250 thrust::device_vector<size_t> d_col_pointers;
251 thrust::device_vector<size_t> d_row_indices;
252 thrust::device_vector<size_t> d_offsets;
253 thrust::device_vector<S> d_prev_diagonal;
254 thrust::device_vector<size_t> d_hessian_offsets;
255 thrust::device_vector<size_t> scalar_to_block_map;
256
257 // For calculating Hessian blocks
258 thrust::host_vector<size_t> h_block_offsets;
259 thrust::device_vector<size_t> d_block_offsets;
260
261public:
262 Hessian() = default;
263
264 void build_structure(Graph<T, S> *graph, StreamPool &streams) {
265 // Implementation for building the Hessian matrix
266
267 // Assume we don't have a GPU hash map.
268 // First we need to count how many Hessian blocks we have
269 // We'll create a set of Hessian block coordinates by iterating over
270 // descriptors Ignore blocks not in the upper triangular part
271 const auto block_coords = get_block_coordinates(graph);
272
273 // Then we need to allocate memory for each block
274 // We can iterate the set and figure out the total memory,
275 // then allocate a big chunk and assign pointers accordingly
276 // TODO: Maybe we can use an GPU exclusive scan instead?
277 size_t num_values = 0;
278 block_indices.clear();
279 for (const auto &coord : block_coords) {
280 block_indices[coord] = num_values;
281 num_values += graph->get_variable_dimension(coord.row) *
282 graph->get_variable_dimension(coord.col);
283 }
284 d_hessian.resize(num_values);
285
286 // We need to end up with a block CSC-style representation
287 // where we can iterate down the blocks in each block columnn
288 // and retrieve the data pointer for each block for the purpose of
289 // constructing a scalar CSC-style representation.
290 build_indices(graph, block_coords);
291
292 // Setup data for computing Hessian as product of Jacobians
293 setup_hessian_computation(graph, streams);
294 }
295
296 void update_values(Graph<T, S> *graph, StreamPool &streams) {
297 // thrust::fill(thrust::device, d_hessian.begin(), d_hessian.end(),
298 // static_cast<S>(0.0));
299
300 // setup_hessian_computation(graph, streams);
301 execute_hessian_computation(graph, streams);
302
303 // auto &f_desc = graph->get_factor_descriptors();
304 // for (auto &f : f_desc) {
305 // h_block_offsets.clear();
306 // d_block_offsets.clear();
307 // f->compute_hessian_blocks(block_indices, d_hessian, h_block_offsets,
308 // d_block_offsets, streams);
309 // }
310
311 // Back up diagonal for damping purposes
312 backup_diagonal(graph, streams);
313 }
314
315 template <typename I>
316 void build_csc_structure(Graph<T, S> *graph, CSCMatrix<S, I> &matrix) {
317
318 const auto hessian_dim = graph->get_hessian_dimension();
319 matrix.d_pointers.resize(hessian_dim + 1);
320 thrust::fill(thrust::device, matrix.d_pointers.begin(),
321 matrix.d_pointers.end(), 0);
322
323 // Now we count how many values in each column.
324 // Note that we're using a block CSC variant (each block is column major)
325 // and we can use this to construct a lower triangular CSR matrix or upper
326 // triangular CSC matrix because the Hessian is symmetric.
327
328 const auto h = d_hessian.data().get();
329 const auto p_col = d_col_pointers.data().get();
330 const auto r_idx = d_row_indices.data().get();
331 const auto block_locations = d_offsets.data().get();
332 const auto hessian_offsets =
333 d_hessian_offsets.data().get(); // this is scalar offset
334 const auto s2b_map = scalar_to_block_map.data().get();
335
336 auto scalar_ptrs = matrix.d_pointers.data().get();
337
338 thrust::for_each(thrust::device, thrust::make_counting_iterator<size_t>(0),
339 thrust::make_counting_iterator<size_t>(hessian_dim),
340 [=] __device__(const size_t hessian_col) {
341 const auto block_col = s2b_map[hessian_col];
342
343 const auto start = p_col[block_col];
344 const auto end = p_col[block_col + 1];
345 size_t num_values =
346 0; // values in upper triangle scalar column
347 // Iterate through each block in the column
348 for (size_t b = start; b < end; b++) {
349
350 // Stop if col == row because our column is sorted by
351 // block row and we only want upper triangle
352 const auto block_row = r_idx[b];
353 const auto nrows = hessian_offsets[block_row + 1] -
354 hessian_offsets[block_row];
355
356 auto scalar_row = hessian_offsets[block_row];
357
358 for (size_t r = 0; r < nrows; r++) {
359 if (scalar_row + r <= hessian_col) {
360 num_values++;
361 } else {
362 break;
363 }
364 }
365 }
366 scalar_ptrs[hessian_col] = num_values;
367 });
368
369 // Now sum up everything
370 thrust::exclusive_scan(thrust::device, matrix.d_pointers.begin(),
371 matrix.d_pointers.end(), matrix.d_pointers.begin());
372
373 // Now we need to iterate through columns again and populate
374 // the column indices and values
375 const size_t nnz =
376 matrix.d_pointers[hessian_dim]; // expensive! we're accessing device
377 // memory
378 matrix.d_indices.resize(nnz);
379 matrix.d_values.resize(nnz);
380
381 auto p_row_indices = matrix.d_indices.data().get();
382
383 // Update indices
384 thrust::for_each(thrust::device, thrust::make_counting_iterator<size_t>(0),
385 thrust::make_counting_iterator<size_t>(hessian_dim),
386 [=] __device__(const size_t hessian_col) {
387 const auto block_col = s2b_map[hessian_col];
388
389 // find diagonal block in column where row == col
390 const auto start = p_col[block_col];
391 const auto end = p_col[block_col + 1];
392 size_t write_idx = scalar_ptrs[hessian_col];
393 // Iterate through each block in the column
394 for (size_t b = start; b < end; b++) {
395
396 // stop if col == row
397 const auto block_row = r_idx[b];
398 const auto nrows = hessian_offsets[block_row + 1] -
399 hessian_offsets[block_row];
400
401 auto scalar_row = hessian_offsets[block_row];
402
403 for (size_t r = 0; r < nrows; r++) {
404 if (scalar_row + r <= hessian_col) {
405 // only write out upper triangle
406 p_row_indices[write_idx++] = scalar_row + r;
407 } else {
408 break;
409 }
410 }
411 }
412 });
413 }
414
415 template <typename I>
416 void update_csc_values(Graph<T, S> *graph, CSCMatrix<S, I> &matrix) {
417
418 const auto hessian_dim = graph->get_hessian_dimension();
419 const auto h = d_hessian.data().get();
420 const auto p_col = d_col_pointers.data().get();
421 const auto r_idx = d_row_indices.data().get();
422 const auto block_locations = d_offsets.data().get();
423 const auto hessian_offsets =
424 d_hessian_offsets.data().get(); // this is scalar offset
425 const auto s2b_map = scalar_to_block_map.data().get();
426
427 auto scalar_row_ptrs = matrix.d_pointers.data().get();
428
429 auto p_values = matrix.d_values.data().get();
430
431 thrust::for_each(
432 thrust::device, thrust::make_counting_iterator<size_t>(0),
433 thrust::make_counting_iterator<size_t>(hessian_dim),
434 [=] __device__(const size_t hessian_col) {
435 const auto block_col = s2b_map[hessian_col];
436
437 // find diagonal block in column where row == col
438 const auto start = p_col[block_col];
439 const auto end = p_col[block_col + 1];
440 size_t write_idx = scalar_row_ptrs[hessian_col];
441 // Iterate through each block in the column
442 for (size_t b = start; b < end; b++) {
443
444 // stop if col == row
445 const auto block_row = r_idx[b];
446 const auto nrows =
447 hessian_offsets[block_row + 1] - hessian_offsets[block_row];
448
449 auto scalar_row = hessian_offsets[block_row];
450
451 const auto col_in_block = hessian_col - hessian_offsets[block_col];
452
453 const auto block = h + block_locations[b] + col_in_block * nrows;
454
455 for (size_t r = 0; r < nrows; r++) {
456 if (scalar_row + r <= hessian_col) {
457 // only write out upper triangle
458 p_values[write_idx++] = block[r];
459 } else {
460 break;
461 }
462 }
463 }
464 });
465 }
466};
467
468} // namespace graphite
Definition block.hpp:6
Definition hessian.hpp:14
Definition hessian.hpp:21
Definition hessian.hpp:44
Definition stream.hpp:7