The mixed multigrid solver example.
This example depends on simple-solver.
This example shows how to use the mixed-precision multigrid solver.
In this example, we first read in a matrix from a file, then generate a right-hand side and an initial guess. The multigrid solver can mix different precision of MultigridLevel. The example features the generating time and runtime of the multigrid solver.
The commented program
const auto executor_string = argc >= 2 ? argv[1] : "reference";
static const version_info & get()
Returns an instance of version_info.
Definition version.hpp:139
Figure out where to run the code
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
0, gko::ReferenceExecutor::create());
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
Creates a new CudaExecutor.
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1396
executor where Ginkgo will perform the computation
const auto exec = exec_map.at(executor_string)();
const int mixed_int = argc >= 3 ? std::atoi(argv[2]) : 1;
const bool use_mixed = mixed_int != 0;
std::cout << "Using mixed precision? " << use_mixed << std::endl;
Read data
std::unique_ptr< MatrixType > read(StreamType &&is, MatrixArgs &&... args)
Reads a matrix stored in matrix market format from an input stream.
Definition mtx_io.hpp:159
Create RHS as 1 and initial guess as 0
auto host_x = vec::create(exec->get_master(),
gko::dim<2>(size, 1));
auto host_b = vec::create(exec->get_master(),
gko::dim<2>(size, 1));
for (auto i = 0; i < size; i++) {
host_x->at(i, 0) = 0.;
host_b->at(i, 0) = 1.;
}
auto x = vec::create(exec);
auto b = vec::create(exec);
x->copy_from(host_x);
b->copy_from(host_b);
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:90
A type representing the dimensions of a multidimensional object.
Definition dim.hpp:26
Calculate initial residual by overwriting b
A->apply(one, x, neg_one, b);
b->compute_norm2(initres);
std::unique_ptr< Matrix > initialize(size_type stride, std::initializer_list< typename Matrix::value_type > vals, std::shared_ptr< const Executor > exec, TArgs &&... create_args)
Creates and initializes a column-vector.
Definition dense.hpp:1574
copy b again
Prepare the stopping criteria
auto iter_stop =
gko::share(gko::stop::Iteration::build().with_max_iters(100u).on(exec));
auto tol_stop =
gko::share(gko::stop::ResidualNorm<ValueType>::build()
.with_baseline(gko::stop::mode::absolute)
.with_reduction_factor(tolerance)
.on(exec));
typename detail::remove_complex_s< T >::type remove_complex
Obtain the type which removed the complex of complex/scalar type or the template parameter of class b...
Definition math.hpp:264
detail::shared_type< OwningPointer > share(OwningPointer &&p)
Marks the object pointed to by p as shared.
Definition utils_helper.hpp:224
Create smoother factory (ir with bj)
ir::build()
.with_solver(bj::build().with_max_block_size(1u))
.with_relaxation_factor(static_cast<ValueType>(0.9))
.with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
.on(exec));
ir2::build()
.with_solver(bj2::build().with_max_block_size(1u))
.with_relaxation_factor(static_cast<MixedType>(0.9))
.with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
.on(exec));
Create RestrictProlong factory
auto mg_level_gen =
gko::share(pgm::build().with_deterministic(
true).on(exec));
auto mg_level_gen2 =
gko::share(pgm2::build().with_deterministic(
true).on(exec));
Create CoarsesSolver factory
ir::build()
.with_solver(bj::build().with_max_block_size(1u))
.with_relaxation_factor(static_cast<ValueType>(0.9))
.with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
.on(exec));
ir2::build()
.with_solver(bj2::build().with_max_block_size(1u))
.with_relaxation_factor(static_cast<MixedType>(0.9))
.with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
.on(exec));
Create multigrid factory
std::shared_ptr<gko::LinOpFactory> multigrid_gen;
if (use_mixed) {
multigrid_gen =
mg::build()
.with_max_levels(10u)
.with_min_coarse_rows(2u)
.with_pre_smoother(smoother_gen, smoother_gen2)
.with_post_uses_pre(true)
.with_mg_level(mg_level_gen, mg_level_gen2)
Definition lin_op.hpp:117
The first (index 0) level will use the first mg_level_gen, smoother_gen which are the factories with ValueType. The rest of levels (>= 1) will use the second (index 1) mg_level_gen2 and smoother_gen2 which use the MixedType. The rest of levels will use different type than the normal multigrid.
return level >= 1 ? 1 : 0;
})
.with_coarsest_solver(coarsest_solver_gen2)
.with_criteria(iter_stop, tol_stop)
.on(exec);
} else {
multigrid_gen = mg::build()
.with_max_levels(10u)
.with_min_coarse_rows(2u)
.with_pre_smoother(smoother_gen)
.with_post_uses_pre(true)
.with_mg_level(mg_level_gen)
.with_coarsest_solver(coarsest_solver_gen)
.with_criteria(iter_stop, tol_stop)
.on(exec);
}
std::chrono::nanoseconds gen_time(0);
auto gen_tic = std::chrono::steady_clock::now();
auto solver = solver_gen->generate(A);
auto solver = multigrid_gen->generate(A);
exec->synchronize();
auto gen_toc = std::chrono::steady_clock::now();
gen_time +=
std::chrono::duration_cast<std::chrono::nanoseconds>(gen_toc - gen_tic);
Add logger
std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
solver->add_logger(logger);
static std::unique_ptr< Convergence > create(std::shared_ptr< const Executor >, const mask_type &enabled_events=Logger::criterion_events_mask|Logger::iteration_complete_mask)
Creates a convergence logger.
Definition convergence.hpp:73
Solve system
exec->synchronize();
std::chrono::nanoseconds time(0);
auto tic = std::chrono::steady_clock::now();
solver->apply(b, x);
exec->synchronize();
auto toc = std::chrono::steady_clock::now();
time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
Calculate residual explicitly, because the residual is not available inside of the multigrid solver
A->apply(one, x, neg_one, b);
b->compute_norm2(res);
std::cout << "Initial residual norm sqrt(r^T r): \n";
write(std::cout, initres);
std::cout << "Final residual norm sqrt(r^T r): \n";
write(std::cout, res);
Print solver statistics
std::cout << "Multigrid iteration count: "
<< logger->get_num_iterations() << std::endl;
std::cout << "Multigrid generation time [ms]: "
<< static_cast<double>(gen_time.count()) / 1000000.0 << std::endl;
std::cout << "Multigrid execution time [ms]: "
<< static_cast<double>(time.count()) / 1000000.0 << std::endl;
std::cout << "Multigrid execution time per iteration[ms]: "
<< static_cast<double>(time.count()) / 1000000.0 /
logger->get_num_iterations()
<< std::endl;
}
Results
This is the expected output:
Initial residual norm sqrt(r^T r):
%%MatrixMarket matrix array real general
1 1
4.3589
Final residual norm sqrt(r^T r):
%%MatrixMarket matrix array real general
1 1
6.31088e-14
Multigrid iteration count: 9
Multigrid generation time [ms]: 3.35361
Multigrid execution time [ms]: 10.048
Multigrid execution time per iteration[ms]: 1.11644
Comments about programming and debugging
The plain program
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>
#include <ginkgo/ginkgo.hpp>
int main(int argc, char* argv[])
{
using ValueType = double;
using MixedType = float;
using IndexType = int;
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
0, gko::ReferenceExecutor::create());
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
const auto exec = exec_map.at(executor_string)();
const int mixed_int = argc >= 3 ? std::atoi(argv[2]) : 1;
const bool use_mixed = mixed_int != 0;
std::cout << "Using mixed precision? " << use_mixed << std::endl;
auto host_x = vec::create(exec->get_master(),
gko::dim<2>(size, 1));
auto host_b = vec::create(exec->get_master(),
gko::dim<2>(size, 1));
for (auto i = 0; i < size; i++) {
host_x->at(i, 0) = 0.;
host_b->at(i, 0) = 1.;
}
auto x = vec::create(exec);
auto b = vec::create(exec);
x->copy_from(host_x);
b->copy_from(host_b);
A->apply(one, x, neg_one, b);
b->compute_norm2(initres);
b->copy_from(host_b);
auto iter_stop =
gko::share(gko::stop::Iteration::build().with_max_iters(100u).on(exec));
auto tol_stop =
gko::share(gko::stop::ResidualNorm<ValueType>::build()
.with_baseline(gko::stop::mode::absolute)
.with_reduction_factor(tolerance)
.on(exec));
ir::build()
.with_solver(bj::build().with_max_block_size(1u))
.with_relaxation_factor(static_cast<ValueType>(0.9))
.with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
.on(exec));
ir2::build()
.with_solver(bj2::build().with_max_block_size(1u))
.with_relaxation_factor(static_cast<MixedType>(0.9))
.with_criteria(gko::stop::Iteration::build().with_max_iters(1u))
.on(exec));
auto mg_level_gen =
gko::share(pgm::build().with_deterministic(
true).on(exec));
auto mg_level_gen2 =
gko::share(pgm2::build().with_deterministic(
true).on(exec));
ir::build()
.with_solver(bj::build().with_max_block_size(1u))
.with_relaxation_factor(static_cast<ValueType>(0.9))
.with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
.on(exec));
ir2::build()
.with_solver(bj2::build().with_max_block_size(1u))
.with_relaxation_factor(static_cast<MixedType>(0.9))
.with_criteria(gko::stop::Iteration::build().with_max_iters(4u))
.on(exec));
std::shared_ptr<gko::LinOpFactory> multigrid_gen;
if (use_mixed) {
multigrid_gen =
mg::build()
.with_max_levels(10u)
.with_min_coarse_rows(2u)
.with_pre_smoother(smoother_gen, smoother_gen2)
.with_post_uses_pre(true)
.with_mg_level(mg_level_gen, mg_level_gen2)
return level >= 1 ? 1 : 0;
})
.with_coarsest_solver(coarsest_solver_gen2)
.with_criteria(iter_stop, tol_stop)
.on(exec);
} else {
multigrid_gen = mg::build()
.with_max_levels(10u)
.with_min_coarse_rows(2u)
.with_pre_smoother(smoother_gen)
.with_post_uses_pre(true)
.with_mg_level(mg_level_gen)
.with_coarsest_solver(coarsest_solver_gen)
.with_criteria(iter_stop, tol_stop)
.on(exec);
}
std::chrono::nanoseconds gen_time(0);
auto gen_tic = std::chrono::steady_clock::now();
auto solver = multigrid_gen->generate(A);
exec->synchronize();
auto gen_toc = std::chrono::steady_clock::now();
gen_time +=
std::chrono::duration_cast<std::chrono::nanoseconds>(gen_toc - gen_tic);
std::shared_ptr<const gko::log::Convergence<ValueType>> logger =
exec->synchronize();
std::chrono::nanoseconds time(0);
auto tic = std::chrono::steady_clock::now();
exec->synchronize();
auto toc = std::chrono::steady_clock::now();
time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
A->apply(one, x, neg_one, b);
b->compute_norm2(res);
std::cout << "Initial residual norm sqrt(r^T r): \n";
write(std::cout, initres);
std::cout << "Final residual norm sqrt(r^T r): \n";
std::cout << "Multigrid iteration count: "
<< logger->get_num_iterations() << std::endl;
std::cout << "Multigrid generation time [ms]: "
<< static_cast<double>(gen_time.count()) / 1000000.0 << std::endl;
std::cout << "Multigrid execution time [ms]: "
<< static_cast<double>(time.count()) / 1000000.0 << std::endl;
std::cout << "Multigrid execution time per iteration[ms]: "
<< static_cast<double>(time.count()) / 1000000.0 /
logger->get_num_iterations()
<< std::endl;
}
CSR is a matrix format which stores only the nonzero coefficients by compressing each row of the matr...
Definition csr.hpp:126
Dense is a matrix format which explicitly stores all values of the matrix.
Definition dense.hpp:120
Parallel graph match (Pgm) is the aggregate method introduced in the paper M.
Definition pgm.hpp:52
A block-Jacobi preconditioner is a block-diagonal linear operator, obtained by inverting the diagonal...
Definition jacobi.hpp:190
CG or the conjugate gradient method is an iterative type Krylov subspace method which is suitable for...
Definition cg.hpp:50
FCG or the flexible conjugate gradient method is an iterative type Krylov subspace method which is su...
Definition fcg.hpp:55
Iterative refinement (IR) is an iterative method that uses another coarse method to approximate the e...
Definition ir.hpp:85
Multigrid methods have a hierarchy of many levels, whose corase level is a subset of the fine level,...
Definition multigrid.hpp:110
@ solver
Solver events.
Definition profiler_hook.hpp:34
constexpr T one()
Returns the multiplicative identity for T.
Definition math.hpp:654
void write(StreamType &&os, MatrixPtrType &&matrix, layout_type layout=detail::mtx_io_traits< std::remove_cv_t< detail::pointee< MatrixPtrType > > >::default_layout)
Writes a matrix into an output stream in matrix market format.
Definition mtx_io.hpp:295