14template <
typename T>
struct MulOp {
40__global__
void count_pose_rows_per_landmark_column_kernel(
41 const size_t *col_pointers,
const size_t *row_indices,
42 size_t landmark_col_start,
size_t num_block_columns,
size_t *pose_counts) {
43 const size_t idx = get_thread_id();
44 const size_t num_landmark_cols = num_block_columns - landmark_col_start;
45 if (idx >= num_landmark_cols) {
49 const size_t l = landmark_col_start + idx;
50 const size_t col_start = col_pointers[l];
51 const size_t col_end = col_pointers[l + 1];
54 for (
size_t ka = col_start; ka < col_end; ka++) {
55 if (row_indices[ka] >= landmark_col_start) {
60 pose_counts[idx] = count;
63__global__
void fill_schur_structure_pairs_kernel(
64 const size_t *col_pointers,
const size_t *row_indices,
65 const size_t landmark_col_start,
const size_t num_block_columns,
66 const size_t *pose_counts,
const size_t *pair_offsets,
67 BlockCoordinates *pairs_out) {
68 const size_t idx = get_thread_id();
69 const size_t num_landmark_cols = num_block_columns - landmark_col_start;
70 if (idx >= num_landmark_cols) {
74 const size_t l = landmark_col_start + idx;
75 const size_t col_start = col_pointers[l];
76 const size_t pose_count = pose_counts[idx];
77 size_t out_offset = pair_offsets[idx];
79 for (
size_t a = 0; a < pose_count; a++) {
80 const size_t i = row_indices[col_start + a];
81 for (
size_t b = a; b < pose_count; b++) {
82 const size_t j = row_indices[col_start + b];
83 pairs_out[out_offset++] = BlockCoordinates{i, j};
88__global__
void fill_schur_mul_tuples_kernel(
89 const size_t *col_pointers,
const size_t *row_indices,
90 const size_t *block_offsets,
size_t landmark_col_start,
91 size_t num_block_columns,
const size_t *pose_counts,
92 const size_t *pair_offsets, SchurMulTuple *tuples_out) {
93 const size_t idx = get_thread_id();
94 const size_t num_landmark_cols = num_block_columns - landmark_col_start;
95 if (idx >= num_landmark_cols) {
99 const size_t l = landmark_col_start + idx;
100 const size_t col_start = col_pointers[l];
101 const size_t pose_count = pose_counts[idx];
102 size_t out_offset = pair_offsets[idx];
104 for (
size_t a = 0; a < pose_count; a++) {
105 const size_t ka = col_start + a;
106 const size_t i = row_indices[ka];
107 const size_t left_offset = block_offsets[ka];
108 for (
size_t b = a; b < pose_count; b++) {
109 const size_t kb = col_start + b;
110 const size_t j = row_indices[kb];
111 tuples_out[out_offset++] =
112 SchurMulTuple{l, i, j, left_offset, block_offsets[kb]};
117template <
typename T,
typename S>
119schur_block_product_kernel(
const MulOp<S> *ops,
const size_t num_ops,
120 const size_t dim_a,
const size_t dim_b,
121 const size_t dim_c) {
122 const size_t idx = get_thread_id();
123 const size_t block_size = dim_a * dim_c;
124 const size_t op_id = idx / block_size;
125 if (op_id >= num_ops) {
129 const size_t offset = idx % block_size;
130 const size_t row = offset % dim_a;
131 const size_t col = offset / dim_a;
133 const auto &op = ops[op_id];
134 const S *left = op.left;
135 const S *middle = op.middle;
136 const S *right = op.right;
141 for (
size_t k = 0; k < dim_b; k++) {
144 for (
size_t j = 0; j < dim_b; j++) {
145 m_rt +=
static_cast<T
>(middle[k + j * dim_b]) *
146 static_cast<T
>(right[col + j * dim_c]);
148 value +=
static_cast<T
>(left[row + k * dim_a]) * m_rt;
151 atomicAdd(op.destination + (row + col * dim_a),
static_cast<S
>(-value));
154template <
int DIM_B,
typename T,
typename S>
156schur_block_product_kernel_dim_b(
const MulOp<S> *ops,
const size_t num_ops,
157 const size_t dim_a,
const size_t dim_c) {
158 const size_t idx = get_thread_id();
159 const size_t block_size = dim_a * dim_c;
160 const size_t op_id = idx / block_size;
161 if (op_id >= num_ops) {
165 const size_t offset = idx % block_size;
166 const size_t row = offset % dim_a;
167 const size_t col = offset / dim_a;
169 const auto &op = ops[op_id];
170 const S *left = op.left;
171 const S *middle = op.middle;
172 const S *right = op.right;
177 for (
int k = 0; k < DIM_B; k++) {
180 for (
int j = 0; j < DIM_B; j++) {
181 m_rt +=
static_cast<T
>(middle[k + j * DIM_B]) *
182 static_cast<T
>(right[col +
static_cast<size_t>(j) * dim_c]);
184 value +=
static_cast<T
>(left[row +
static_cast<size_t>(k) * dim_a]) * m_rt;
187 atomicAdd(op.destination + (row + col * dim_a),
static_cast<S
>(-value));
190template <
typename highp,
typename S,
typename T>
191__global__
void block_matvec_assign_batched_kernel(
192 const S *values,
const size_t *a_offsets,
const T *x_base, T *y_base,
193 const size_t *vec_offsets,
size_t num_blocks,
size_t dim) {
194 const size_t idx = get_thread_id();
195 const size_t total_rows = num_blocks * dim;
196 if (idx >= total_rows) {
200 const size_t block_id = idx / dim;
201 const size_t row = idx % dim;
203 const S *A = values + a_offsets[block_id];
204 const size_t vec_offset = vec_offsets[block_id];
205 const T *x = x_base + vec_offset;
206 T *y = y_base + vec_offset;
209 for (
size_t c = 0; c < dim; c++) {
210 sum +=
static_cast<T
>(A[row + c * dim]) *
static_cast<T
>(x[c]);
215template <
typename T,
typename S>
216__global__
void block_matvec_add_batched_kernel(
217 const S *values,
const HplMatVecOp *ops,
const size_t num_ops,
218 const T *x_base, T *y_base,
const size_t rows,
const size_t cols) {
219 const size_t idx = get_thread_id();
220 const size_t total_rows = num_ops * rows;
221 if (idx >= total_rows) {
225 const size_t op_id = idx / rows;
226 const size_t row = idx % rows;
227 const auto &op = ops[op_id];
229 const S *A = values + op.a_offset;
230 const T *x = x_base + op.x_offset;
231 T *y = y_base + op.y_offset;
234 for (
size_t c = 0; c < cols; c++) {
235 sum +=
static_cast<T
>(A[row + c * rows]) *
static_cast<T
>(x[c]);
237 atomicAdd(y + row, sum);
240template <
typename T,
typename S>
241__global__
void block_matvec_transpose_add_batched_kernel(
242 const S *values,
const HplMatVecOp *ops,
const size_t num_ops,
243 const T *x_base, T *y_base,
const size_t rows,
const size_t cols) {
244 const size_t idx = get_thread_id();
245 const size_t total_cols = num_ops * cols;
246 if (idx >= total_cols) {
250 const size_t op_id = idx / cols;
251 const size_t col = idx % cols;
252 const auto &op = ops[op_id];
254 const S *A = values + op.a_offset;
255 const T *x = x_base + op.x_offset;
256 T *y = y_base + op.y_offset;
259 for (
size_t r = 0; r < rows; r++) {
260 sum +=
static_cast<T
>(A[r + col * rows]) *
static_cast<T
>(x[r]);
262 atomicAdd(y + col, sum);
265template <
typename Src,
typename Dst = Src>
267block_copy_batched_kernel(
const Src *src_values, Dst *dst_values,
268 const BlockCopyOp *ops,
const size_t num_ops,
269 const size_t rows,
const size_t cols) {
270 const size_t idx = get_thread_id();
271 const size_t block_size = rows * cols;
272 const size_t total = num_ops * block_size;
277 const size_t op_id = idx / block_size;
278 const size_t local_idx = idx % block_size;
279 const auto &op = ops[op_id];
280 dst_values[op.dst_offset + local_idx] =
281 static_cast<Dst
>(src_values[op.src_offset + local_idx]);
The top-level namespace for Graphite.
Definition eigen_solver.cpp:4
Stores offsets for Hpl*Hll^(-1)*Hpl^T operation.
Definition schur.hpp:14