diff --git a/coursework_report/gmres_MPI.cpp b/coursework_report/gmres_MPI.cpp new file mode 100644 index 0000000..9a2e4cd --- /dev/null +++ b/coursework_report/gmres_MPI.cpp @@ -0,0 +1,363 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +std::random_device rd; +std::mt19937 gen(rd()); +std::uniform_real_distribution<> floatDist(-1, 0); + +class Matvec +{ +private: + int matrix_rows; + MPI_Comm comm; + int rank, size; + +public: + int row, col; + std::vector local_A; + std::vector shifts_rcv; + std::vector counts_rcv; + std::vector shifts_snd; + std::vector counts_snd; + + Matvec(const int n_, const int m_, MPI_Comm comm) + { + MPI_Comm_rank(comm, &rank); + MPI_Comm_size(comm, &size); + + int local_n = n_ / size; // Число строк на каждом процессе + + if (rank == size - 1) + { + local_n += n_ % size; + } + + local_A.resize(local_n * m_); + shifts_rcv.resize(size); + shifts_snd.resize(size); + counts_rcv.resize(size); + counts_snd.resize(size); + + for (int i = 0; i < local_n * m_; i++) + { + local_A[i] = (-floatDist(gen)); + } + + row = local_n; + col = m_; + matrix_rows = n_; + + for (int i = 0; i < size; ++i) + { + counts_rcv[i] = (i != size - 1) ? n_ / size : n_ / size + n_ % size; + shifts_rcv[i] = (n_ / size) * i; + shifts_snd[i] = shifts_rcv[i] * col; + counts_snd[i] = counts_rcv[i] * col; + } + } + void append(std::vector &x) + { + col++; + local_A.resize(row * col); + + std::copy(x.begin(), x.end(), local_A.end() - x.size()); + } + + std::vector matvec(std::vector &x, CBLAS_ORDER order = CblasRowMajor) + { + std::vector local_result(row); + + int lda = (order == CblasRowMajor) ? col : row; + + cblas_dgemv(order, CblasNoTrans, row, col, 1.0, local_A.data(), lda, x.data(), 1, 1.0, local_result.data(), 1); + + std::vector result(matrix_rows); + + MPI_Gatherv(local_result.data(), row, MPI_DOUBLE, result.data(), counts_rcv.data(), shifts_rcv.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(result.data(), result.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD); + + return result; + } +}; + +std::vector Solve_Upper_Triangular(std::vector &R, std::vector &b, int n) +{ + std::vector x(n, 0); + + for (int i = n - 1; i >= 0; --i) + { + double sum = 0; + for (int j = i + 1; j < n; ++j) + { + sum += R[i * n + j] * x[j]; + } + x[i] = (b[i] - sum) / R[i * n + i]; + } + return x; +} + +double norm(std::vector &y) +{ + double norm_local = std::sqrt(std::accumulate(y.begin(), y.end(), 0.0, [](double acc, double x) + { return acc + x * x; })); + return norm_local; +} +double squares(std::vector &y) +{ + double norm_local = std::accumulate(y.begin(), y.end(), 0.0, [](double acc, double x) + { return acc + x * x; }); + return norm_local; +} + +std::vector slice(std::vector &y, int rows, int cols, int y_m) +{ + std::vector res(rows * cols); + + for (int i = 0; i < rows; i++) + { + std::copy(y.begin() + i * y_m, y.begin() + i * y_m + cols, res.begin() + i * cols); + } + + return res; +} + +// find rotation to a-> (a^2 +b ^2)^0.5, b -> 0 +void find_rot(double a, double b, double &c, double &s) +{ + // std::cout << a << " " << b << '\n'; + double mod = sqrt(a * a + b * b); + if (mod == 0) + { + c = 1; + s = 0; + } + else + { + s = -b / mod; + c = a / mod; + } +} + +// apply all rotates to last col of matrix A. +void apply_rot_to_last_col(double *A, size_t m, size_t stride, std::vector const &cs, std::vector const &ss) +{ + size_t cur_pos = m - 1, next_pos = m + (stride - 1); + for (int i = 0; i < cs.size(); ++i, cur_pos += stride, next_pos += stride) + { + double c = cs[i], s = ss[i]; + double a1 = c * A[cur_pos] - s * A[next_pos]; + double a2 = s * A[cur_pos] + c * A[next_pos]; + A[cur_pos] = a1, A[next_pos] = a2; + } +} +void apply_rot_b(double *A, size_t m, size_t stride, std::vector const &cs, std::vector const &ss) +{ + size_t cur_pos = 0, next_pos = cur_pos + 1; + for (int i = 0; i < cs.size(); ++i, cur_pos++, next_pos++) + { + double c = cs[i], s = ss[i]; + double a1 = c * A[cur_pos] - s * A[next_pos]; + double a2 = s * A[cur_pos] + c * A[next_pos]; + A[cur_pos] = a1, A[next_pos] = a2; + } +} + +std::vector GMRES(std::vector &b, Matvec &matvec, double atol = 1e-8, double rtol = 1e-6, int maxiter = 50, int proc_id = 0, int n_proc = 1) +{ + int size, rank; + MPI_Comm_rank(MPI_COMM_WORLD, &rank); + MPI_Comm_size(MPI_COMM_WORLD, &size); + + Matvec Q_(b.size(), 1, MPI_COMM_WORLD); + + auto norm_b = norm(b); // norm_b = norm(b) + + std::vector q(b.size()); + + std::transform(b.begin(), b.end(), q.begin(), [norm_b](double value) + { return value / norm_b; }); // q = b / norm_b + + MPI_Scatterv(q.data(), Q_.counts_snd.data(), Q_.shifts_snd.data(), MPI_DOUBLE, Q_.local_A.data(), q.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD); // разбили Q по процессам. + + std::vector h((maxiter + 1) * maxiter, 0); // h = np.zeros((maxiter + 1, maxiter)) + + std::vector y; + std::vector x; + std::vector cos_rot, sin_rot; + + int n; + double r = norm_b; + + for (n = 0; n < maxiter; ++n) + { + MPI_Bcast(&r, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(q.data(), q.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD); + + auto v = matvec.matvec(q); // v = matvec(q) + + std::vector v_slice(v.begin() + Q_.shifts_snd[proc_id], v.begin() + Q_.shifts_snd[proc_id] + Q_.counts_snd[proc_id]); + + for (int j = 0; j <= n; ++j) + { + std::vector tmp(v_slice.size(), 0); + std::vector Q_j(Q_.local_A.begin() + Q_.row * j, Q_.local_A.begin() + Q_.row * (j + 1)); + + double local, global = 0; + + local = cblas_ddot(Q_j.size(), Q_j.data(), 1, v_slice.data(), 1); + + MPI_Allreduce(&local, &global, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + MPI_Bcast(&global, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + + h[j * maxiter + n] = global; // h[j, n] = np.dot(Q[:, j], v) + + std::transform(Q_j.begin(), Q_j.end(), tmp.begin(), [&global](double value) + { return value * global; }); // h[j, n] * Q[:, j] + + std::transform(v_slice.begin(), v_slice.end(), tmp.begin(), v_slice.begin(), + [](double a, double b) + { return a - b; }); // v -= h[j, n] * Q[:, j] + + MPI_Gatherv(v_slice.data(), v_slice.size(), MPI_DOUBLE, v.data(), Q_.counts_rcv.data(), Q_.shifts_rcv.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD); + // MPI_Bcast(v.data(), v.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD) + } + + // PARALLEL NORM OF v start + double h_n_1_n = 0; + double local = squares(v_slice); + MPI_Allreduce(&local, &h_n_1_n, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + + h_n_1_n = std::sqrt(h_n_1_n); + MPI_Bcast(&h_n_1_n, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + // PARALLEL NORM OF v end + + h[(n + 1) * maxiter + n] = h_n_1_n; + + std::vector q_slice(v_slice.size()); + + std::transform(v_slice.begin(), v_slice.end(), q_slice.begin(), [&h_n_1_n](double value) + { return value / h_n_1_n; }); + + Q_.append(q_slice); + + MPI_Gatherv(q_slice.data(), q_slice.size(), MPI_DOUBLE, q.data(), Q_.counts_rcv.data(), Q_.shifts_rcv.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD); + + if (proc_id == 0) + { + std::vector e1(n + 2, 0); // e1 = np.zeros(n+2) + e1[0] = norm_b; // e1[0] = norm_b + + // QR algo + apply_rot_to_last_col(h.data(), n + 1, maxiter, cos_rot, sin_rot); // apply all computed rotations to last h column + + double c, s; + find_rot(h[n * maxiter + n], h[(n + 1) * maxiter + n], c, s); // find new rotation to turn h in triangular matrix + + double temp1 = h[n * maxiter + n] * c - h[(n + 1) * maxiter + n] * s; + double temp2 = h[n * maxiter + n] * s + h[(n + 1) * maxiter + n] * c; + h[n * maxiter + n] = temp1; + h[(n + 1) * maxiter + n] = temp2; + + cos_rot.push_back(c), sin_rot.emplace_back(s); + + apply_rot_b(e1.data(), n + 2, 1, cos_rot, sin_rot); // apply all rotation to right side: Q.T @ b + // QR end + auto R = slice(h, n + 2, n + 1, maxiter); // h[:n+2, :n+1] + + int R_n = n + 2, R_m = n + 1; + + // // Lin solve start + auto R_for_lin = slice(R, n + 1, n + 1, n + 1); + r = std::abs(e1.back()); // r = np.abs(Qb[-1]) + + std::vector Qb_for_lin(n + 1); + std::copy(e1.begin(), e1.begin() + n + 1, Qb_for_lin.begin()); + + y = Solve_Upper_Triangular(R_for_lin, Qb_for_lin, n + 1); + // // // Lin solve end + } + + if (proc_id) + { + y.resize(n + 1); + } + + MPI_Bcast(y.data(), y.size(), MPI_DOUBLE, 0, MPI_COMM_WORLD); + MPI_Bcast(&r, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); + + if (r < atol + norm_b * rtol) + { + if (proc_id == 0) + { + std::cout << "r: " << r << " n: " << n << std::endl; + } + + Q_.local_A = std::vector(Q_.local_A.begin(), Q_.local_A.begin() + (n + 1) * Q_.row); + Q_.col = n + 1; + + x = Q_.matvec(y, CblasColMajor); + + return x; + } + } + + Q_.local_A = std::vector(Q_.local_A.begin(), Q_.local_A.begin() + (n + 1) * Q_.row); + Q_.col = n + 1; + + x = Q_.matvec(y, CblasColMajor); + + return x; +} + +int main(int argc, char **argv) +{ + int mpi_thread_prov; + const double t = 1e-8; + MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &mpi_thread_prov); + int proc_id, n_proc; + MPI_Comm_rank(MPI_COMM_WORLD, &proc_id); + MPI_Comm_size(MPI_COMM_WORLD, &n_proc); + + int n = 20; + + std::vector b(n); + + if (proc_id == 0) + { + for (auto &x : b) + { + x = (-floatDist(gen)); + } + } + MPI_Bcast(b.data(), n, MPI_DOUBLE, 0, MPI_COMM_WORLD); + + Matvec matrix(n, n, MPI_COMM_WORLD); + + auto res = GMRES(b, matrix, t, 1e-6, n, proc_id, n_proc); + + auto tmp = matrix.matvec(res); + + if (proc_id == 0) + { + std::transform(tmp.begin(), tmp.end(), b.begin(), tmp.begin(), [](double a, double b) + { return a - b; }); + std::cout << "||Ax - b|| = " << norm(tmp) << std::endl; + + // for (auto x : res) + // { + // std::cout << x << " "; + // } + // std::cout << std::endl; + } + + MPI_Finalize(); + + return 0; +} \ No newline at end of file