#include <algorithm>  // std::clamp
#include <cstddef>

#include <cuda_runtime.h>

// z = a * x + y, elementwise.
template <typename T>
__global__
void axpy_kernel(size_t n, T *z, const T a, const T *x, const T *y) {
  size_t idx = static_cast<size_t>(blockIdx.x) * static_cast<size_t>(blockDim.x) +
               static_cast<size_t>(threadIdx.x);
  // Guard the final partial block so threads with idx >= n touch no memory.
  if (idx < n) {
    z[idx] = a * x[idx] + y[idx];
  }
}
template <typename T>
void axpy_async(cudaStream_t stream, size_t n, T *z, const T a, const T *x,
                const T *y) {
  size_t threads_per_block = 256;
  size_t num_blocks = (n + threads_per_block - 1) / threads_per_block;
  axpy_kernel<T><<<num_blocks, threads_per_block, 0, stream>>>(n, z, a, x, y);
}
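// Usage sketch (illustrative assumptions: d_x, d_y, d_z are device buffers of
// length n allocated with cudaMalloc):
//   axpy_async<float>(stream, n, d_z, 2.0f, d_x, d_y);
//   cudaStreamSynchronize(stream);  // launch is async; sync before reading d_z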
// z += a * diag * x, elementwise (diagonally scaled damping update).
template <typename T>
__global__
void damping_kernel(size_t n, T *z, const T a, const T *diag, const T *x) {
  size_t idx = static_cast<size_t>(blockIdx.x) * static_cast<size_t>(blockDim.x) +
               static_cast<size_t>(threadIdx.x);
  if (idx < n) {
    z[idx] += a * diag[idx] * x[idx];
  }
}
template <typename T>
void damp_by_factor_async(cudaStream_t stream, size_t n, T *z, const T a,
                          const T *diag, const T *x) {
  size_t threads_per_block = 256;
  size_t num_blocks = (n + threads_per_block - 1) / threads_per_block;
  damping_kernel<T>
      <<<num_blocks, threads_per_block, 0, stream>>>(n, z, a, diag, x);
}
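// Usage sketch (illustrative; d_diag is assumed to hold the diagonal of the
// damping matrix, and the result is accumulated into d_z in place):
//   damp_by_factor_async<double>(stream, n, d_z, 0.1, d_diag, d_x);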
// Clamp each element of x into [min_val, max_val] in place.
template <typename T>
__global__
void clamp_kernel(size_t n, T min_val, T max_val, T *x) {
  size_t idx = static_cast<size_t>(blockIdx.x) * static_cast<size_t>(blockDim.x) +
               static_cast<size_t>(threadIdx.x);
  if (idx < n) {
    // std::clamp is constexpr since C++17; calling it in device code under
    // nvcc requires --expt-relaxed-constexpr (cuda::std::clamp also works).
    x[idx] = std::clamp(x[idx], min_val, max_val);
  }
}
template <typename T>
void clamp_async(cudaStream_t stream, size_t n, T min_val, T max_val, T *x) {
  size_t threads_per_block = 256;
  size_t num_blocks = (n + threads_per_block - 1) / threads_per_block;
  clamp_kernel<T>
      <<<num_blocks, threads_per_block, 0, stream>>>(n, min_val, max_val, x);
}
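// Usage sketch (illustrative): clamp d_x into [-1, 1] in place:
//   clamp_async<float>(stream, n, -1.0f, 1.0f, d_x);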
// out = scale * x, elementwise.
template <typename T>
__global__
void rescale_vec_kernel(size_t n, T *out, const T scale, const T *x) {
  size_t idx = static_cast<size_t>(blockIdx.x) * static_cast<size_t>(blockDim.x) +
               static_cast<size_t>(threadIdx.x);
  if (idx < n) {
    out[idx] = scale * x[idx];
  }
}
template <typename T>
void rescale_vec_async(cudaStream_t stream, size_t n, T *out, const T scale,
                       const T *x) {
  size_t threads_per_block = 256;
  size_t num_blocks = (n + threads_per_block - 1) / threads_per_block;
  rescale_vec_kernel<T>
      <<<num_blocks, threads_per_block, 0, stream>>>(n, out, scale, x);
}
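// End-to-end sketch chaining the launchers on one stream (buffer names are
// illustrative assumptions; error checking elided). Aliasing out == x in
// rescale_vec is safe here because each thread touches only its own index:
//   axpy_async<float>(stream, n, d_z, 2.0f, d_x, d_y);    // d_z = 2*x + y
//   clamp_async<float>(stream, n, -1.0f, 1.0f, d_z);      // clamp into [-1, 1]
//   rescale_vec_async<float>(stream, n, d_z, 0.5f, d_z);  // d_z *= 0.5, in place
//   cudaStreamSynchronize(stream);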