The simple solver with performance debugging example.
This example depends on simple-solver-logging, minimal-cuda-solver.
Introduction
About the example
This example runs a solver on a test problem and shows how to use loggers to debug performance and convergence rate.
The commented program
creates a zero vector
template <typename ValueType>
std::unique_ptr<vec<ValueType>> create_vector(
ValueType value)
{
auto res = vec<ValueType>::create(exec);
return res;
}
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:90
A type representing the dimensions of a multidimensional object.
Definition dim.hpp:26
This structure is used as an intermediate data type to store a sparse matrix.
Definition matrix_data.hpp:126
utilities for computing norms and residuals
template <typename ValueType>
ValueType get_first_element(const vec<ValueType>* norm)
{
return norm->get_executor()->copy_val_to_host(norm->get_const_values());
}
template <typename ValueType>
{
auto exec = b->get_executor();
b->compute_norm2(b_norm);
return get_first_element(b_norm.get());
}
template <typename ValueType>
const gko::LinOp* system_matrix,
const vec<ValueType>* b,
const vec<ValueType>* x)
{
system_matrix->
apply(one, x, neg_one, res);
return compute_norm(res.get());
}
}
namespace loggers {
Definition lin_op.hpp:117
LinOp * apply(ptr_param< const LinOp > b, ptr_param< LinOp > x)
Applies a linear operator to a vector (or a sequence of vectors).
Definition lin_op.hpp:129
std::shared_ptr< const Executor > get_executor() const noexcept
Returns the Executor of the object.
Definition polymorphic_object.hpp:243
std::unique_ptr< Matrix > initialize(size_type stride, std::initializer_list< typename Matrix::value_type > vals, std::shared_ptr< const Executor > exec, TArgs &&... create_args)
Creates and initializes a column-vector.
Definition dense.hpp:1574
constexpr T one()
Returns the multiplicative identity for T.
Definition math.hpp:654
typename detail::remove_complex_s< T >::type remove_complex
Obtain the type which removed the complex of complex/scalar type or the template parameter of class b...
Definition math.hpp:264
detail::cloned_type< Pointer > clone(const Pointer &p)
Creates a unique clone of the object pointed to by p.
Definition utils_helper.hpp:173
A logger that accumulates the time of all operations. For each operation type (allocations, free, copy, internal operations i.e. kernels), the timing is taken before and after. This can create significant overhead since to ensure proper timings, calls to synchronize are required.
{
this->start_operation(exec, "allocate");
}
{
this->end_operation(exec, "allocate");
}
void on_free_started(const gko::Executor* exec,
{
this->start_operation(exec, "free");
}
void on_free_completed(const gko::Executor* exec,
{
this->end_operation(exec, "free");
}
void on_copy_started(const gko::Executor* from, const gko::Executor* to,
{
this->start_operation(to, "copy");
}
void on_copy_completed(const gko::Executor* from, const gko::Executor* to,
{
this->end_operation(to, "copy");
}
void on_operation_launched(const gko::Executor* exec,
const gko::Operation* op) const override
{
this->start_operation(exec, op->
get_name());
}
void on_operation_completed(const gko::Executor* exec,
const gko::Operation* op) const override
{
this->end_operation(exec, op->
get_name());
}
void write_data(std::ostream& ostream)
{
for (const auto& entry : total) {
ostream << "\t" << entry.first.c_str() << ": "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(
entry.second)
.count()
<< std::endl;
}
}
private:
The first step in using the Ginkgo library consists of creating an executor.
Definition executor.hpp:615
virtual void synchronize() const =0
Synchronize the operations launched on the executor with its master.
virtual const char * get_name() const noexcept
Returns the operation's name.
std::uintptr_t uintptr
Unsigned integer type capable of holding a pointer to void.
Definition types.hpp:142
Helper which synchronizes and starts the time before every operation.
const std::string& name) const
{
nested.emplace_back(0);
start[name] = std::chrono::steady_clock::now();
}
Helper to compute the end time and store the operation's time at its end. Also time nested operations.
void end_operation(
const gko::Executor* exec,
const std::string& name)
const
{
const auto end = std::chrono::steady_clock::now();
const auto diff = end - start[name];
make sure timings for nested operations are not counted twice
total[name] += diff - nested.back();
nested.pop_back();
if (nested.size() > 0) {
nested.back() += diff;
}
}
mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
mutable std::map<std::string, std::chrono::steady_clock::duration> total;
the position i of this vector holds the total time spend on child operations on nesting level i
mutable std::vector<std::chrono::steady_clock::duration> nested;
};
This logger tracks the persistently allocated data
Store amount of bytes allocated on every allocation
{
storage[location] = num_bytes;
}
Reset the amount of bytes on every free
{
storage[location] = 0;
}
Write the data after summing the total from all allocations
void write_data(std::ostream& ostream)
{
for (const auto& e : storage) {
total += e.second;
}
ostream << "Storage: " << total << std::endl;
}
private:
mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
};
Logs true and recurrent residuals of the solver
template <typename ValueType>
Depending on the available information, store the norm or compute it from the residual. If the true residual norm could not be computed, store the value -1.0.
{
if (residual_norm) {
rec_res_norms.push_back(utils::get_first_element(
gko::as<real_vec<ValueType>>(residual_norm)));
} else {
rec_res_norms.push_back(
utils::compute_norm(
gko::as<vec<ValueType>>(residual)));
}
if (solution) {
true_res_norms.push_back(utils::compute_residual_norm(
matrix, b,
gko::as<vec<ValueType>>(solution)));
} else {
true_res_norms.push_back(-1.0);
}
}
ResidualLogger(
const gko::LinOp* matrix,
const vec<ValueType>* b)
:
gko::log::Logger(
gko::log::Logger::iteration_complete_mask),
matrix{matrix},
b{b}
{}
void write_data(std::ostream& ostream)
{
ostream << "Recurrent Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : rec_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
ostream << "True Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : true_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
}
private:
const vec<ValueType>* b;
mutable std::vector<gko::remove_complex<ValueType>> rec_res_norms;
mutable std::vector<gko::remove_complex<ValueType>> true_res_norms;
};
}
namespace {
The Ginkgo namespace.
Definition abstract_factory.hpp:20
std::decay_t< T > * as(U *obj)
Performs polymorphic type conversion.
Definition utils_helper.hpp:307
Print usage help
void print_usage(const char* filename)
{
std::cerr << "Usage: " << filename << " [executor] [matrix file]"
<< std::endl;
std::cerr << "matrix file should be a file in matrix market format. "
"The file data/A.mtx is provided as an example."
<< std::endl;
std::exit(-1);
}
template <typename ValueType>
{
std::cout << "[" << std::endl;
for (int i = 0; i < elements_to_print; ++i) {
std::cout <<
"\t" << vec->
at(i) << std::endl;
}
std::cout << "];" << std::endl;
}
}
int main(int argc, char* argv[])
{
const dim< 2 > & get_size() const noexcept
Returns the size of the operator.
Definition lin_op.hpp:210
Dense is a matrix format which explicitly stores all values of the matrix.
Definition dense.hpp:120
value_type & at(size_type row, size_type col) noexcept
Returns a single element of the matrix.
Definition dense.hpp:901
Parametrize the benchmark here Pick a value type
using ValueType = double;
using IndexType = int;
Pick a matrix format
CSR is a matrix format which stores only the nonzero coefficients by compressing each row of the matr...
Definition csr.hpp:126
Pick a solver
CG or the conjugate gradient method is an iterative type Krylov subspace method which is suitable for...
Definition cg.hpp:50
Pick a preconditioner type
This factory is a utility which can be used to generate Identity operators.
Definition identity.hpp:90
Pick a residual norm reduction value
Pick an output file name
const auto of_name = "log.txt";
Simple shortcut
Print version information
static const version_info & get()
Returns an instance of version_info.
Definition version.hpp:139
Figure out where to run the code
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
Figure out where to run the code
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
Creates a new CudaExecutor.
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1396
executor where Ginkgo will perform the computation
const auto exec = exec_map.at(executor_string)();
Read the input matrix file directory
std::string input_mtx = "data/A.mtx";
if (argc == 3) {
input_mtx = std::string(argv[2]);
}
Read data: A is read from disk Create a StorageLogger to track the size of A
auto storage_logger = std::make_shared<loggers::StorageLogger>();
Add the logger to the executor
void add_logger(std::shared_ptr< const log::Logger > logger) override
Definition executor.hpp:838
Read the matrix A from file
std::unique_ptr< MatrixType > read(StreamType &&is, MatrixArgs &&... args)
Reads a matrix stored in matrix market format from an input stream.
Definition mtx_io.hpp:159
detail::shared_type< OwningPointer > share(OwningPointer &&p)
Marks the object pointed to by p as shared.
Definition utils_helper.hpp:224
Remove the storage logger
void remove_logger(const log::Logger *logger) override
Definition executor.hpp:851
Pick a maximum iteration count
const auto max_iters = A->get_size()[0];
Generate b and x vectors
auto b = utils::create_vector<ValueType>(exec, A->get_size()[0], 1.0);
auto x = utils::create_vector<ValueType>(exec, A->get_size()[0], 0.0);
Declare the solver factory. The preconditioner's arguments should be adapted if needed.
auto solver_factory =
solver::build()
.with_criteria(
gko::stop::ResidualNorm<ValueType>::build()
.with_reduction_factor(reduction_factor),
gko::stop::Iteration::build().with_max_iters(max_iters))
.with_preconditioner(preconditioner::create(exec))
.on(exec);
Declare the output file for all our loggers
std::ofstream output_file(of_name);
Do a warmup run
Clone x to not overwrite the original one
Generate and call apply on a solver
solver_factory->generate(A)->apply(b, x_clone);
}
Do a timed run
Clone x to not overwrite the original one
Synchronize ensures no operation are ongoing
Time before generate
auto g_tic = std::chrono::steady_clock::now();
Generate a solver
auto generated_solver = solver_factory->generate(A);
Time after generate
auto g_tac = std::chrono::steady_clock::now();
Compute the generation time
auto generate_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(g_tac - g_tic);
Write the generate time to the output file
output_file << "Generate time (ns): " << generate_time.count()
<< std::endl;
Similarly time the apply
auto a_tic = std::chrono::steady_clock::now();
generated_solver->apply(b, x_clone);
auto a_tac = std::chrono::steady_clock::now();
auto apply_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(a_tac - a_tic);
output_file << "Apply time (ns): " << apply_time.count() << std::endl;
Compute the residual norm
auto residual =
utils::compute_residual_norm(A.get(), b.get(), x_clone.get());
output_file << "Residual_norm: " << residual << std::endl;
}
Log the internal operations using the OperationLogger without timing
Create an OperationLogger to analyze the generate step
auto gen_logger = std::make_shared<loggers::OperationLogger>();
Add the generate logger to the executor
Generate a solver
auto generated_solver = solver_factory->generate(A);
Remove the generate logger from the executor
Write the data to the output file
output_file << "Generate operations times (ns):" << std::endl;
gen_logger->write_data(output_file);
Create an OperationLogger to analyze the apply step
auto apply_logger = std::make_shared<loggers::OperationLogger>();
Create a ResidualLogger to log the recurent residual
auto res_logger = std::make_shared<loggers::ResidualLogger<ValueType>>(
A.get(), b.get());
generated_solver->add_logger(res_logger);
Solve the system
generated_solver->apply(b, x);
Write the data to the output file
output_file << "Apply operations times (ns):" << std::endl;
apply_logger->write_data(output_file);
res_logger->write_data(output_file);
}
Print solution
std::cout << "Solution, first ten entries: \n";
print_vector(x.get());
Print output file location
std::cout << "The performance and residual data can be found in " << of_name
<< std::endl;
}
Results
This is the expected standard output:
Solution, first ten entries:
[
0.252218
0.108645
0.0662811
0.0630433
0.0384088
0.0396536
0.0402648
0.0338935
0.0193098
0.0234653
];
The performance and residual data can be found in log.txt
Here is a sample output in the file log.txt:
Generate time (ns): 861
Apply time (ns): 108144
Residual_norm: 2.10788e-15
Generate operations times (ns):
Apply operations times (ns):
allocate: 14991
cg::initialize#8: 872
cg::step_1#5: 7683
cg::step_2#7: 7756
copy: 7751
csr::advanced_spmv#5: 21819
csr::spmv#3: 20429
dense::compute_dot#3: 18043
dense::compute_norm2#2: 16726
free: 8857
residual_norm::residual_norm#9: 3614
Recurrent Residual Norms:
[
4.3589
2.30455
1.46771
0.984875
0.741833
0.513623
0.384165
0.316439
0.227709
0.170312
0.0973722
0.0616831
0.0454123
0.031953
0.0161606
0.00657015
0.00264367
0.000858809
0.000286461
1.64195e-15
];
True Residual Norms:
[
4.3589
2.30455
1.46771
0.984875
0.741833
0.513623
0.384165
0.316439
0.227709
0.170312
0.0973722
0.0616831
0.0454123
0.031953
0.0161606
0.00657015
0.00264367
0.000858809
0.000286461
2.10788e-15
];
Comments about programming and debugging
The plain program
#include <algorithm>
#include <array>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <ostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <ginkgo/ginkgo.hpp>
template <typename ValueType>
template <typename ValueType>
namespace utils {
template <typename ValueType>
std::unique_ptr<vec<ValueType>> create_vector(
ValueType value)
{
auto res = vec<ValueType>::create(exec);
return res;
}
template <typename ValueType>
ValueType get_first_element(const vec<ValueType>* norm)
{
return norm->get_executor()->copy_val_to_host(norm->get_const_values());
}
template <typename ValueType>
{
auto exec = b->get_executor();
b->compute_norm2(b_norm);
return get_first_element(b_norm.get());
}
template <typename ValueType>
const gko::LinOp* system_matrix, const vec<ValueType>* b,
const vec<ValueType>* x)
{
system_matrix->
apply(one, x, neg_one, res);
return compute_norm(res.get());
}
}
namespace loggers {
struct OperationLogger : gko::log::Logger {
void on_allocation_started(const gko::Executor* exec,
{
this->start_operation(exec, "allocate");
}
void on_allocation_completed(const gko::Executor* exec,
{
this->end_operation(exec, "allocate");
}
void on_free_started(const gko::Executor* exec,
{
this->start_operation(exec, "free");
}
void on_free_completed(const gko::Executor* exec,
{
this->end_operation(exec, "free");
}
void on_copy_started(const gko::Executor* from, const gko::Executor* to,
{
this->start_operation(to, "copy");
}
void on_copy_completed(const gko::Executor* from, const gko::Executor* to,
{
this->end_operation(to, "copy");
}
void on_operation_launched(const gko::Executor* exec,
const gko::Operation* op) const override
{
this->start_operation(exec, op->
get_name());
}
void on_operation_completed(const gko::Executor* exec,
const gko::Operation* op) const override
{
this->end_operation(exec, op->
get_name());
}
void write_data(std::ostream& ostream)
{
for (const auto& entry : total) {
ostream << "\t" << entry.first.c_str() << ": "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(
entry.second)
.count()
<< std::endl;
}
}
private:
void start_operation(const gko::Executor* exec,
const std::string& name) const
{
nested.emplace_back(0);
start[name] = std::chrono::steady_clock::now();
}
void end_operation(const gko::Executor* exec, const std::string& name) const
{
const auto end = std::chrono::steady_clock::now();
const auto diff = end - start[name];
total[name] += diff - nested.back();
nested.pop_back();
if (nested.size() > 0) {
nested.back() += diff;
}
}
mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
mutable std::map<std::string, std::chrono::steady_clock::duration> total;
mutable std::vector<std::chrono::steady_clock::duration> nested;
};
struct StorageLogger : gko::log::Logger {
void on_allocation_completed(const gko::Executor*,
{
storage[location] = num_bytes;
}
void on_free_completed(const gko::Executor*,
{
storage[location] = 0;
}
void write_data(std::ostream& ostream)
{
for (const auto& e : storage) {
total += e.second;
}
ostream << "Storage: " << total << std::endl;
}
private:
mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
};
template <typename ValueType>
struct ResidualLogger : gko::log::Logger {
const gko::LinOp* residual,
const gko::LinOp* solution,
const gko::LinOp* residual_norm) const override
{
if (residual_norm) {
rec_res_norms.push_back(utils::get_first_element(
gko::as<real_vec<ValueType>>(residual_norm)));
} else {
rec_res_norms.push_back(
utils::compute_norm(
gko::as<vec<ValueType>>(residual)));
}
if (solution) {
true_res_norms.push_back(utils::compute_residual_norm(
matrix, b,
gko::as<vec<ValueType>>(solution)));
} else {
true_res_norms.push_back(-1.0);
}
}
ResidualLogger(const gko::LinOp* matrix, const vec<ValueType>* b)
: gko::log::Logger(gko::log::Logger::iteration_complete_mask),
matrix{matrix},
b{b}
{}
void write_data(std::ostream& ostream)
{
ostream << "Recurrent Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : rec_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
ostream << "True Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : true_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
}
private:
const gko::LinOp* matrix;
const vec<ValueType>* b;
mutable std::vector<gko::remove_complex<ValueType>> rec_res_norms;
mutable std::vector<gko::remove_complex<ValueType>> true_res_norms;
};
}
namespace {
void print_usage(const char* filename)
{
std::cerr << "Usage: " << filename << " [executor] [matrix file]"
<< std::endl;
std::cerr << "matrix file should be a file in matrix market format. "
"The file data/A.mtx is provided as an example."
<< std::endl;
std::exit(-1);
}
template <typename ValueType>
{
std::cout << "[" << std::endl;
for (int i = 0; i < elements_to_print; ++i) {
std::cout <<
"\t" << vec->
at(i) << std::endl;
}
std::cout << "];" << std::endl;
}
}
int main(int argc, char* argv[])
{
using ValueType = double;
using IndexType = int;
const auto of_name = "log.txt";
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
const auto exec = exec_map.at(executor_string)();
std::string input_mtx = "data/A.mtx";
if (argc == 3) {
input_mtx = std::string(argv[2]);
}
auto storage_logger = std::make_shared<loggers::StorageLogger>();
const auto max_iters = A->get_size()[0];
auto b = utils::create_vector<ValueType>(exec, A->get_size()[0], 1.0);
auto x = utils::create_vector<ValueType>(exec, A->get_size()[0], 0.0);
auto solver_factory =
solver::build()
.with_criteria(
gko::stop::ResidualNorm<ValueType>::build()
.with_reduction_factor(reduction_factor),
gko::stop::Iteration::build().with_max_iters(max_iters))
.with_preconditioner(preconditioner::create(exec))
.on(exec);
std::ofstream output_file(of_name);
{
solver_factory->generate(A)->apply(b, x_clone);
}
{
auto g_tic = std::chrono::steady_clock::now();
auto generated_solver = solver_factory->generate(A);
auto g_tac = std::chrono::steady_clock::now();
auto generate_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(g_tac - g_tic);
output_file << "Generate time (ns): " << generate_time.count()
<< std::endl;
auto a_tic = std::chrono::steady_clock::now();
generated_solver->apply(b, x_clone);
auto a_tac = std::chrono::steady_clock::now();
auto apply_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(a_tac - a_tic);
output_file << "Apply time (ns): " << apply_time.count() << std::endl;
auto residual =
utils::compute_residual_norm(A.get(), b.get(), x_clone.get());
output_file << "Residual_norm: " << residual << std::endl;
}
{
auto gen_logger = std::make_shared<loggers::OperationLogger>();
auto generated_solver = solver_factory->generate(A);
output_file << "Generate operations times (ns):" << std::endl;
gen_logger->write_data(output_file);
auto apply_logger = std::make_shared<loggers::OperationLogger>();
auto res_logger = std::make_shared<loggers::ResidualLogger<ValueType>>(
A.get(), b.get());
generated_solver->add_logger(res_logger);
generated_solver->apply(b, x);
output_file << "Apply operations times (ns):" << std::endl;
apply_logger->write_data(output_file);
res_logger->write_data(output_file);
}
std::cout << "Solution, first ten entries: \n";
print_vector(x.get());
std::cout << "The performance and residual data can be found in " << of_name
<< std::endl;
}
@ solver
Solver events.
Definition profiler_hook.hpp:34