14 thrust::device_vector<T> v;
17 thrust::device_vector<T> v1;
18 thrust::device_vector<T> v2;
19 thrust::device_vector<T> r;
20 thrust::device_vector<T> p;
21 thrust::device_vector<T> z;
22 thrust::device_vector<T> diag;
23 thrust::device_vector<T> x_backup;
24 thrust::device_vector<T> y;
31 Preconditioner<T, S> *preconditioner;
34 PCGSolver(
size_t max_iter, T tol, T rejection_ratio,
35 Preconditioner<T, S> *preconditioner)
36 : max_iter(max_iter), tol(tol), rejection_ratio(rejection_ratio),
37 damping_factor(0), preconditioner(preconditioner) {}
39 virtual void update_structure(Graph<T, S> *graph,
42 preconditioner->update_structure(graph, streams);
45 virtual void update_values(Graph<T, S> *graph,
StreamPool &streams)
override {
46 preconditioner->update_values(graph, streams);
49 virtual void set_damping_factor(Graph<T, S> *graph, T damping_factor,
51 this->damping_factor = damping_factor;
52 preconditioner->set_damping_factor(graph, damping_factor, streams);
56 virtual bool solve(Graph<T, S> *graph, T *x,
StreamPool &streams)
override {
58 auto &vertex_descriptors = graph->get_vertex_descriptors();
59 auto &factor_descriptors = graph->get_factor_descriptors();
60 T *b = graph->get_b().data().get();
61 size_t dim_h = graph->get_hessian_dimension();
64 for (
size_t i = 0; i < factor_descriptors.size(); i++) {
65 dim_r += factor_descriptors[i]->get_residual_size();
74 const cudaStream_t stream = 0;
76 thrust::fill(thrust::cuda::par_nosync.on(stream), x, x + dim_h, 0.0);
77 thrust::fill(thrust::cuda::par_nosync.on(stream), v1.begin(), v1.end(),
79 thrust::fill(thrust::cuda::par_nosync.on(stream), v2.begin(), v2.end(),
83 thrust::copy(thrust::cuda::par_nosync.on(stream), graph->get_b().begin(),
84 graph->get_b().end(), r.begin());
88 thrust::fill(thrust::cuda::par_nosync.on(stream), diag.begin(), diag.end(),
90 for (
size_t i = 0; i < factor_descriptors.size(); i++) {
91 factor_descriptors[i]->compute_hessian_scalar_diagonal_async(
93 graph->get_jacobian_scales().data().get());
97 T min_diag =
static_cast<T
>(1.0e-6);
98 T max_diag =
static_cast<T
>(1.0e32);
99 ops::clamp_async(stream, dim_h, min_diag, max_diag, diag.data().get());
101 cudaStreamSynchronize(stream);
105 auto rnorm = thrust::inner_product(thrust::device, r.begin(), r.end(),
106 r.begin(),
static_cast<T
>(0.0));
107 rnorm = std::sqrt(rnorm);
108 auto scale = 1.0 / rnorm;
109 ops::rescale_vec_async<T>(stream, dim_h, y.data().get(), scale,
111 cudaStreamSynchronize(stream);
115 thrust::fill(z.begin(), z.end(), 0.0);
116 preconditioner->apply(graph, z.data().get(), y.data().get(), streams);
119 thrust::copy(z.begin(), z.end(), p.begin());
121 x_backup.resize(dim_h);
124 T rz = (T)thrust::inner_product(r.begin(), r.end(), z.begin(),
125 static_cast<T
>(0.0));
127 T rz_0 = std::numeric_limits<T>::infinity();
129 for (
size_t k = 0; k < max_iter; k++) {
138 thrust::fill(v1.begin(), v1.end(), 0.0);
139 auto v1_ptr = v1.data().get();
140 for (
size_t i = 0; i < factor_descriptors.size(); i++) {
141 factor_descriptors[i]->compute_Jv(
142 v1_ptr, p.data().get(), graph->get_jacobian_scales().data().get(),
144 v1_ptr += factor_descriptors[i]->get_residual_size();
153 thrust::fill(v2.begin(), v2.end(), 0.0);
154 v1_ptr = v1.data().get();
155 for (
size_t i = 0; i < factor_descriptors.size(); i++) {
156 factor_descriptors[i]->compute_Jtv(
157 v2.data().get(), v1_ptr, graph->get_jacobian_scales().data().get(),
159 v1_ptr += factor_descriptors[i]->get_residual_size();
163 ops::damp_by_factor_async(stream, dim_h, v2.data().get(), damping_factor,
164 diag.data().get(), p.data().get());
167 T alpha = (rz) / thrust::inner_product(thrust::cuda::par.on(stream),
168 p.begin(), p.end(), v2.begin(),
169 static_cast<T
>(0.0));
171 thrust::copy(thrust::cuda::par.on(stream), x, x + dim_h,
173 ops::axpy_async(stream, dim_h, x, alpha, p.data().get(), x);
176 ops::axpy_async(stream, dim_h, r.data().get(), -alpha, v2.data().get(),
180 rnorm = (T)thrust::inner_product(thrust::cuda::par.on(stream), r.begin(),
181 r.end(), r.begin(),
static_cast<T
>(0.0));
182 rnorm = std::sqrt(rnorm);
184 ops::rescale_vec_async<T>(stream, dim_h, y.data().get(), scale,
188 thrust::fill(thrust::cuda::par.on(stream), z.begin(), z.end(), 0.0);
189 preconditioner->apply(graph, z.data().get(), y.data().get(), streams);
190 T rz_new = thrust::inner_product(thrust::cuda::par.on(stream), r.begin(),
191 r.end(), z.begin(),
static_cast<T
>(0.0));
194 if (std::abs(rz_new) > rejection_ratio * rz_0 || std::isnan(rz_new)) {
195 thrust::copy(thrust::device, x_backup.begin(), x_backup.end(), x);
201 rz_0 = std::min(rz_0, std::abs(rz_new));
207 T beta = rz_new / rz;
211 ops::axpy_async(stream, dim_h, p.data().get(), beta, p.data().get(),
213 cudaStreamSynchronize(stream);
215 if (std::abs(
static_cast<T
>(rz_new)) < tol) {