Graphite
Loading...
Searching...
No Matches
pcg.hpp
Go to the documentation of this file.
1
#pragma once

#include <cuda_runtime.h>

#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/fill.h>
#include <thrust/inner_product.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <limits>
// NOTE(review): project headers (Solver, Graph, Preconditioner, StreamPool,
// ops) appear to be included on lines elided by the doc generator — confirm.
8
9namespace graphite {
10
12template <typename T, typename S> class PCGSolver : public Solver<T, S> {
13private:
14 thrust::device_vector<T> v;
15
16 // Need vectors for residuals of each factor
17 thrust::device_vector<T> v1; // v1 = Jv (dimension same as r)
18 thrust::device_vector<T> v2; // v2 = J^T v1 (dimension same as x)
19 thrust::device_vector<T> r; // residual
20 thrust::device_vector<T> p; // search direction
21 thrust::device_vector<T> z; // preconditioned residual
22 thrust::device_vector<T> diag; // diagonal of Hessian
23 thrust::device_vector<T> x_backup;
24 thrust::device_vector<T> y;
25
26 size_t max_iter;
27 T tol;
28 T rejection_ratio;
29 T damping_factor;
30
31 Preconditioner<T, S> *preconditioner;
32
33public:
34 PCGSolver(size_t max_iter, T tol, T rejection_ratio,
35 Preconditioner<T, S> *preconditioner)
36 : max_iter(max_iter), tol(tol), rejection_ratio(rejection_ratio),
37 damping_factor(0), preconditioner(preconditioner) {}
38
39 virtual void update_structure(Graph<T, S> *graph,
40 StreamPool &streams) override {
41
42 preconditioner->update_structure(graph, streams);
43 }
44
45 virtual void update_values(Graph<T, S> *graph, StreamPool &streams) override {
46 preconditioner->update_values(graph, streams);
47 }
48
49 virtual void set_damping_factor(Graph<T, S> *graph, T damping_factor,
50 StreamPool &streams) override {
51 this->damping_factor = damping_factor;
52 preconditioner->set_damping_factor(graph, damping_factor, streams);
53 }
54
55 // Assumes that x is already initialized
56 virtual bool solve(Graph<T, S> *graph, T *x, StreamPool &streams) override {
57
58 auto &vertex_descriptors = graph->get_vertex_descriptors();
59 auto &factor_descriptors = graph->get_factor_descriptors();
60 T *b = graph->get_b().data().get();
61 size_t dim_h = graph->get_hessian_dimension();
62
63 size_t dim_r = 0;
64 for (size_t i = 0; i < factor_descriptors.size(); i++) {
65 dim_r += factor_descriptors[i]->get_residual_size();
66 }
67
68 // Resize vectors (assuming this causes host synchronization)
69 v1.resize(dim_r);
70 v2.resize(dim_h);
71 r.resize(dim_h); // dim h because dim(r) = dim(Ax) = dim(b)
72 diag.resize(dim_h);
73
74 const cudaStream_t stream = 0;
75
76 thrust::fill(thrust::cuda::par_nosync.on(stream), x, x + dim_h, 0.0);
77 thrust::fill(thrust::cuda::par_nosync.on(stream), v1.begin(), v1.end(),
78 0.0);
79 thrust::fill(thrust::cuda::par_nosync.on(stream), v2.begin(), v2.end(),
80 0.0);
81
82 // Compute residual
83 thrust::copy(thrust::cuda::par_nosync.on(stream), graph->get_b().begin(),
84 graph->get_b().end(), r.begin());
85
86 // 3. Add damping factor
87 // v2 += damping_factor*diag(H)*x
88 thrust::fill(thrust::cuda::par_nosync.on(stream), diag.begin(), diag.end(),
89 0.0);
90 for (size_t i = 0; i < factor_descriptors.size(); i++) {
91 factor_descriptors[i]->compute_hessian_scalar_diagonal_async(
92 diag.data().get(),
93 graph->get_jacobian_scales().data().get()); // also on default stream
94 }
95
96 // Check for negative values in diag and print an error if found
97 T min_diag = static_cast<T>(1.0e-6);
98 T max_diag = static_cast<T>(1.0e32);
99 ops::clamp_async(stream, dim_h, min_diag, max_diag, diag.data().get());
100
101 cudaStreamSynchronize(stream);
102
103 // Rescale r
104 y.resize(dim_h);
105 auto rnorm = thrust::inner_product(thrust::device, r.begin(), r.end(),
106 r.begin(), static_cast<T>(0.0));
107 rnorm = std::sqrt(rnorm);
108 auto scale = 1.0 / rnorm;
109 ops::rescale_vec_async<T>(stream, dim_h, y.data().get(), scale,
110 r.data().get());
111 cudaStreamSynchronize(stream);
112 // Apply preconditioner
113 z.resize(dim_h);
114
115 thrust::fill(z.begin(), z.end(), 0.0);
116 preconditioner->apply(graph, z.data().get(), y.data().get(), streams);
117
118 p.resize(dim_h);
119 thrust::copy(z.begin(), z.end(), p.begin()); // p = z
120
121 x_backup.resize(dim_h);
122
123 // 1. First compute dot(r, z)
124 T rz = (T)thrust::inner_product(r.begin(), r.end(), z.begin(),
125 static_cast<T>(0.0));
126
127 T rz_0 = std::numeric_limits<T>::infinity();
128
129 for (size_t k = 0; k < max_iter; k++) {
130
131 if (rz == 0) {
132 // std::cout << "rz is zero, stopping at iteration " << k << std::endl;
133 break;
134 }
135
136 // auto t_jv_start = std::chrono::steady_clock::now();
137 // 2. Compute v1 = Jp
138 thrust::fill(v1.begin(), v1.end(), 0.0);
139 auto v1_ptr = v1.data().get(); // reset
140 for (size_t i = 0; i < factor_descriptors.size(); i++) {
141 factor_descriptors[i]->compute_Jv(
142 v1_ptr, p.data().get(), graph->get_jacobian_scales().data().get(),
143 streams);
144 v1_ptr += factor_descriptors[i]->get_residual_size();
145 }
146 // auto t_jv_end = std::chrono::steady_clock::now();
147 // std::cout << "Time for Jv: "
148 // << std::chrono::duration<double>(t_jv_end -
149 // t_jv_start).count()
150 // << " seconds" << std::endl;
151
152 // 3. Compute v2 = J^T v1
153 thrust::fill(v2.begin(), v2.end(), 0.0);
154 v1_ptr = v1.data().get(); // reset
155 for (size_t i = 0; i < factor_descriptors.size(); i++) {
156 factor_descriptors[i]->compute_Jtv(
157 v2.data().get(), v1_ptr, graph->get_jacobian_scales().data().get(),
158 streams);
159 v1_ptr += factor_descriptors[i]->get_residual_size();
160 }
161 // Add damping factor
162 // v2 += damping_factor*diag(H)*p
163 ops::damp_by_factor_async(stream, dim_h, v2.data().get(), damping_factor,
164 diag.data().get(), p.data().get());
165
166 // 4. Compute alpha = dot(r, z) / dot(p, v2)
167 T alpha = (rz) / thrust::inner_product(thrust::cuda::par.on(stream),
168 p.begin(), p.end(), v2.begin(),
169 static_cast<T>(0.0));
170 // 5. x += alpha * p
171 thrust::copy(thrust::cuda::par.on(stream), x, x + dim_h,
172 x_backup.begin());
173 ops::axpy_async(stream, dim_h, x, alpha, p.data().get(), x);
174
175 // 6. r -= alpha * v2
176 ops::axpy_async(stream, dim_h, r.data().get(), -alpha, v2.data().get(),
177 r.data().get());
178 // cudaStreamSynchronize(0);
179
180 rnorm = (T)thrust::inner_product(thrust::cuda::par.on(stream), r.begin(),
181 r.end(), r.begin(), static_cast<T>(0.0));
182 rnorm = std::sqrt(rnorm);
183 scale = 1.0 / rnorm;
184 ops::rescale_vec_async<T>(stream, dim_h, y.data().get(), scale,
185 r.data().get());
186
187 // Apply preconditioner again
188 thrust::fill(thrust::cuda::par.on(stream), z.begin(), z.end(), 0.0);
189 preconditioner->apply(graph, z.data().get(), y.data().get(), streams);
190 T rz_new = thrust::inner_product(thrust::cuda::par.on(stream), r.begin(),
191 r.end(), z.begin(), static_cast<T>(0.0));
192
193 // if (rz_new > rejection_ratio * rz_0) {
194 if (std::abs(rz_new) > rejection_ratio * rz_0 || std::isnan(rz_new)) {
195 thrust::copy(thrust::device, x_backup.begin(), x_backup.end(), x);
196 // std::cout << "Rejection: rz_new = " << rz_new
197 // << ", rz_0 = " << rz_0 << " at iteration " << k + 1 <<
198 // std::endl;
199 break;
200 }
201 rz_0 = std::min(rz_0, std::abs(rz_new));
202
203 // 8. Compute beta
204 // std::cout << "rz_new: " << rz_new << ", rz: " << rz
205 // << ", at iteration " << k + 1 << std::endl;
206
207 T beta = rz_new / rz;
208 rz = rz_new;
209
210 // 9. Update p
211 ops::axpy_async(stream, dim_h, p.data().get(), beta, p.data().get(),
212 z.data().get());
213 cudaStreamSynchronize(stream);
214
215 if (std::abs(static_cast<T>(rz_new)) < tol) {
216 // std::cout << "Converged after " << k + 1
217 // << " iterations with residual: " << rz_new << std::endl;
218 break;
219 }
220 // if (k == max_iter - 1) {
221 // std::cout << "Reached maximum iterations: " << max_iter
222 // << " with residual: " << rz_new << std::endl;
223 // }
224 }
225 // TODO: Figure out failure cases
226 return true;
227 }
228};
229} // namespace graphite
Preconditioned Conjugate Gradient (PCG) solver.
Definition pcg.hpp:12
Linear solver interface. Implement this for your own linear solvers.
Definition solver.hpp:12
Definition stream.hpp:7