14 thrust::device_vector<T> v;
17 thrust::device_vector<T> v1;
18 thrust::device_vector<T> v2;
19 thrust::device_vector<T> r;
20 thrust::device_vector<T> p;
21 thrust::device_vector<T> z;
22 thrust::device_vector<T> diag;
23 thrust::device_vector<T> x_backup;
24 thrust::device_vector<T> y;
30 bool use_identity_damping;
32 Preconditioner<T, S> *preconditioner;
35 PCGSolver(
size_t max_iter, T tol, T rejection_ratio,
36 Preconditioner<T, S> *preconditioner)
37 : max_iter(max_iter), tol(tol), rejection_ratio(rejection_ratio),
38 damping_factor(0), use_identity_damping(
false),
39 preconditioner(preconditioner) {}
41 virtual void update_structure(Graph<T, S> *graph,
44 preconditioner->update_structure(graph, streams);
47 virtual void update_values(Graph<T, S> *graph,
StreamPool &streams)
override {
48 preconditioner->update_values(graph, streams);
51 virtual void set_damping_factor(Graph<T, S> *graph, T damping_factor,
52 const bool use_identity,
54 this->damping_factor = damping_factor;
55 this->use_identity_damping = use_identity;
56 preconditioner->set_damping_factor(graph, damping_factor, use_identity,
61 virtual bool solve(Graph<T, S> *graph, T *x,
StreamPool &streams)
override {
63 auto &vertex_descriptors = graph->get_vertex_descriptors();
64 auto &factor_descriptors = graph->get_factor_descriptors();
65 T *b = graph->get_b().data().get();
66 size_t dim_h = graph->get_hessian_dimension();
69 for (
size_t i = 0; i < factor_descriptors.size(); i++) {
70 dim_r += factor_descriptors[i]->get_residual_size();
79 const cudaStream_t stream = 0;
81 thrust::fill(thrust::cuda::par_nosync.on(stream), x, x + dim_h, 0.0);
82 thrust::fill(thrust::cuda::par_nosync.on(stream), v1.begin(), v1.end(),
84 thrust::fill(thrust::cuda::par_nosync.on(stream), v2.begin(), v2.end(),
88 thrust::copy(thrust::cuda::par_nosync.on(stream), graph->get_b().begin(),
89 graph->get_b().end(), r.begin());
93 thrust::fill(thrust::cuda::par_nosync.on(stream), diag.begin(), diag.end(),
95 for (
size_t i = 0; i < factor_descriptors.size(); i++) {
96 factor_descriptors[i]->compute_hessian_scalar_diagonal_async(
98 graph->get_jacobian_scales().data().get());
102 T min_diag =
static_cast<T
>(1.0e-6);
103 T max_diag =
static_cast<T
>(1.0e32);
104 ops::clamp_async(stream, dim_h, min_diag, max_diag, diag.data().get());
106 cudaStreamSynchronize(stream);
110 auto rnorm = thrust::inner_product(thrust::device, r.begin(), r.end(),
111 r.begin(),
static_cast<T
>(0.0));
112 rnorm = std::sqrt(rnorm);
113 auto scale = 1.0 / rnorm;
114 ops::rescale_vec_async<T>(stream, dim_h, y.data().get(), scale,
116 cudaStreamSynchronize(stream);
120 thrust::fill(z.begin(), z.end(), 0.0);
121 preconditioner->apply(graph, z.data().get(), y.data().get(), streams);
124 thrust::copy(z.begin(), z.end(), p.begin());
126 x_backup.resize(dim_h);
129 T rz = (T)thrust::inner_product(r.begin(), r.end(), z.begin(),
130 static_cast<T
>(0.0));
132 T rz_0 = std::numeric_limits<T>::infinity();
133 for (
size_t k = 0; k < max_iter; k++) {
141 thrust::fill(v1.begin(), v1.end(), 0.0);
142 auto v1_ptr = v1.data().get();
143 for (
size_t i = 0; i < factor_descriptors.size(); i++) {
144 factor_descriptors[i]->compute_Jv(
145 v1_ptr, p.data().get(), graph->get_jacobian_scales().data().get(),
147 v1_ptr += factor_descriptors[i]->get_residual_size();
156 thrust::fill(v2.begin(), v2.end(), 0.0);
157 v1_ptr = v1.data().get();
158 for (
size_t i = 0; i < factor_descriptors.size(); i++) {
159 factor_descriptors[i]->compute_Jtv(
160 v2.data().get(), v1_ptr, graph->get_jacobian_scales().data().get(),
162 v1_ptr += factor_descriptors[i]->get_residual_size();
166 ops::damp_by_factor_async(stream, dim_h, v2.data().get(), damping_factor,
167 use_identity_damping, diag.data().get(),
171 T alpha = (rz) / thrust::inner_product(thrust::cuda::par.on(stream),
172 p.begin(), p.end(), v2.begin(),
173 static_cast<T
>(0.0));
175 thrust::copy(thrust::cuda::par.on(stream), x, x + dim_h,
177 ops::axpy_async(stream, dim_h, x, alpha, p.data().get(), x);
180 ops::axpy_async(stream, dim_h, r.data().get(), -alpha, v2.data().get(),
184 rnorm = (T)thrust::inner_product(thrust::cuda::par.on(stream), r.begin(),
185 r.end(), r.begin(),
static_cast<T
>(0.0));
186 rnorm = std::sqrt(rnorm);
188 ops::rescale_vec_async<T>(stream, dim_h, y.data().get(), scale,
192 thrust::fill(thrust::cuda::par.on(stream), z.begin(), z.end(), 0.0);
193 preconditioner->apply(graph, z.data().get(), y.data().get(), streams);
194 T rz_new = thrust::inner_product(thrust::cuda::par.on(stream), r.begin(),
195 r.end(), z.begin(),
static_cast<T
>(0.0));
198 if (std::abs(rz_new) > rejection_ratio * rz_0 || std::isnan(rz_new)) {
199 thrust::copy(thrust::device, x_backup.begin(), x_backup.end(), x);
203 std::cout <<
"rejected pcg update\n";
206 rz_0 = std::min(rz_0, std::abs(rz_new));
212 T beta = rz_new / (rz);
216 ops::axpy_async(stream, dim_h, p.data().get(), beta, p.data().get(),
218 cudaStreamSynchronize(stream);
220 if (std::abs(
static_cast<T
>(rz_new)) < tol) {