The Mixed Precision Iterative Refinement (MPIR) solver example.
This example depends on iterative-refinement.
 
  This example manually implements a Mixed Precision Iterative Refinement (MPIR) solver.
In this example, we first read in a matrix from file, then generate a right-hand side and an initial guess. An inaccurate CG solver in single precision is used as the inner solver to an iterative refinement (IR) in double precision method which solves a linear system. 
 
The commented program
Print version information
static const version_info & get()
Returns an instance of version_info.
Definition version.hpp:139
 Figure out where to run the code
if (argc == 2 && (std::string(argv[1]) == "--help")) {
    std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
    std::exit(-1);
}
 
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
    exec_map{
        {"cuda",
         [] {
         }},
        {"hip",
         [] {
         }},
        {"dpcpp",
         [] {
                 0, gko::ReferenceExecutor::create());
         }},
        {"reference", [] { return gko::ReferenceExecutor::create(); }}};
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
Creates a new CudaExecutor.
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
Creates a new DpcppExecutor.
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
Creates a new HipExecutor.
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Creates a new OmpExecutor.
Definition executor.hpp:1396
executor where Ginkgo will perform the computation
const auto exec = exec_map.at(executor_string)();  
Read data
std::unique_ptr< MatrixType > read(StreamType &&is, MatrixArgs &&... args)
Reads a matrix stored in matrix market format from an input stream.
Definition mtx_io.hpp:159
 Create RHS and initial guess as 1
auto host_x = vec::create(exec->get_master(), 
gko::dim<2>(size, 1));
 
for (auto i = 0; i < size; i++) {
    host_x->at(i, 0) = 1.;
}
std::size_t size_type
Integral type used for allocation quantities.
Definition types.hpp:90
detail::cloned_type< Pointer > clone(const Pointer &p)
Creates a unique clone of the object pointed to by p.
Definition utils_helper.hpp:173
A type representing the dimensions of a multidimensional object.
Definition dim.hpp:26
 Calculate initial residual by overwriting b
A->apply(one, x, neg_one, b);
b->compute_norm2(initres_vec);
std::unique_ptr< Matrix > initialize(size_type stride, std::initializer_list< typename Matrix::value_type > vals, std::shared_ptr< const Executor > exec, TArgs &&... create_args)
Creates and initializes a column-vector.
Definition dense.hpp:1574
 Build lower-precision system matrix and residual
auto solver_A = solver_mtx::create(exec);
auto inner_residual = solver_vec::create(exec);
auto outer_residual = vec::create(exec);
A->convert_to(solver_A);
b->convert_to(outer_residual);
restore b
Create inner solver
auto inner_solver =
    cg::build()
        .with_criteria(
            gko::stop::ResidualNorm<SolverType>::build()
                .with_reduction_factor(inner_reduction_factor),
            gko::stop::Iteration::build().with_max_iters(max_inner_iters))
        .on(exec)
        ->generate(give(solver_A));
Solve system
exec->synchronize();
std::chrono::nanoseconds time(0);
auto initres = exec->copy_val_to_host(initres_vec->get_const_values());
auto inner_solution = solver_vec::create(exec);
auto outer_delta = vec::create(exec);
auto tic = std::chrono::steady_clock::now();
int iter = -1;
while (true) {
    ++iter;
convert residual to inner precision
outer_residual->convert_to(inner_residual);
outer_residual->compute_norm2(res_vec);
auto res = exec->copy_val_to_host(res_vec->get_const_values());
break if we exceed the number of iterations or have converged
if (iter > max_outer_iters || res / initres < outer_reduction_factor) {
    break;
}
Use the inner solver to solve A * inner_solution = inner_residual with residual as initial guess.
inner_solution->copy_from(inner_residual);
inner_solver->apply(inner_residual, inner_solution);
convert inner solution to outer precision
inner_solution->convert_to(outer_delta);
x = x + inner_solution
x->add_scaled(one, outer_delta);
residual = b - A * x
    outer_residual->copy_from(b);
    A->apply(neg_one, x, one, outer_residual);
}
 
auto toc = std::chrono::steady_clock::now();
time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
Calculate residual
A->apply(one, x, neg_one, b);
b->compute_norm2(res_vec);
 
std::cout << "Initial residual norm sqrt(r^T r):\n";
write(std::cout, initres_vec);
std::cout << "Final residual norm sqrt(r^T r):\n";
write(std::cout, res_vec);
Print solver statistics
    std::cout << "MPIR iteration count:     " << iter << std::endl;
    std::cout << "MPIR execution time [ms]: "
              << static_cast<double>(time.count()) / 1000000.0 << std::endl;
}
 
Results
This is the expected output:
Initial residual norm sqrt(r^T r):
%%MatrixMarket matrix array real general
1 1
194.679
Final residual norm sqrt(r^T r):
%%MatrixMarket matrix array real general
1 1
1.22728e-10
MPIR iteration count:     25
MPIR execution time [ms]: 0.846559
Comments about programming and debugging 
 
The plain program
 
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <string>
 
#include <ginkgo/ginkgo.hpp>
 
 
int main(int argc, char* argv[])
{
    using ValueType = double;
    using SolverType = float;
    using IndexType = int;
 
    RealValueType outer_reduction_factor{1e-12};
    RealSolverType inner_reduction_factor{1e-2};
 
 
    if (argc == 2 && (std::string(argv[1]) == "--help")) {
        std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
        std::exit(-1);
    }
 
    const auto executor_string = argc >= 2 ? argv[1] : "reference";
    std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
        exec_map{
            {"cuda",
             [] {
             }},
            {"hip",
             [] {
             }},
            {"dpcpp",
             [] {
                     0, gko::ReferenceExecutor::create());
             }},
            {"reference", [] { return gko::ReferenceExecutor::create(); }}};
 
    const auto exec = exec_map.at(executor_string)();  
 
    auto host_x = vec::create(exec->get_master(), 
gko::dim<2>(size, 1));
 
    for (auto i = 0; i < size; i++) {
        host_x->at(i, 0) = 1.;
    }
 
    A->apply(one, x, neg_one, b);
    b->compute_norm2(initres_vec);
 
    auto solver_A = solver_mtx::create(exec);
    auto inner_residual = solver_vec::create(exec);
    auto outer_residual = vec::create(exec);
    A->convert_to(solver_A);
    b->convert_to(outer_residual);
 
    b->copy_from(host_x);
 
    auto inner_solver =
        cg::build()
            .with_criteria(
                gko::stop::ResidualNorm<SolverType>::build()
                    .with_reduction_factor(inner_reduction_factor),
                gko::stop::Iteration::build().with_max_iters(max_inner_iters))
            .on(exec)
            ->generate(
give(solver_A));
 
    exec->synchronize();
    std::chrono::nanoseconds time(0);
    auto initres = exec->copy_val_to_host(initres_vec->get_const_values());
    auto inner_solution = solver_vec::create(exec);
    auto outer_delta = vec::create(exec);
    auto tic = std::chrono::steady_clock::now();
    int iter = -1;
    while (true) {
        ++iter;
 
        outer_residual->convert_to(inner_residual);
        outer_residual->compute_norm2(res_vec);
        auto res = exec->copy_val_to_host(res_vec->get_const_values());
 
        if (iter > max_outer_iters || res / initres < outer_reduction_factor) {
            break;
        }
 
        inner_solution->copy_from(inner_residual);
        inner_solver->apply(inner_residual, inner_solution);
 
        inner_solution->convert_to(outer_delta);
 
        x->add_scaled(one, outer_delta);
 
        outer_residual->copy_from(b);
        A->apply(neg_one, x, one, outer_residual);
    }
 
    auto toc = std::chrono::steady_clock::now();
    time += std::chrono::duration_cast<std::chrono::nanoseconds>(toc - tic);
 
    A->apply(one, x, neg_one, b);
    b->compute_norm2(res_vec);
 
    std::cout << "Initial residual norm sqrt(r^T r):\n";
    write(std::cout, initres_vec);
 
    std::cout << "Final residual norm sqrt(r^T r):\n";
    write(std::cout, res_vec);
 
 
    std::cout << "MPIR iteration count:     " << iter << std::endl;
    std::cout << "MPIR execution time [ms]: "
              << static_cast<double>(time.count()) / 1000000.0 << std::endl;
}
CSR is a matrix format which stores only the nonzero coefficients by compressing each row of the matr...
Definition csr.hpp:126
Dense is a matrix format which explicitly stores all values of the matrix.
Definition dense.hpp:120
CG or the conjugate gradient method is an iterative type Krylov subspace method which is suitable for...
Definition cg.hpp:50
constexpr T one()
Returns the multiplicative identity for T.
Definition math.hpp:654
typename detail::remove_complex_s< T >::type remove_complex
Obtain the type which removed the complex of complex/scalar type or the template parameter of class b...
Definition math.hpp:264
void write(StreamType &&os, MatrixPtrType &&matrix, layout_type layout=detail::mtx_io_traits< std::remove_cv_t< detail::pointee< MatrixPtrType > > >::default_layout)
Writes a matrix into an output stream in matrix market format.
Definition mtx_io.hpp:295
std::remove_reference< OwningPointer >::type && give(OwningPointer &&p)
Marks that the object pointed to by p can be given to the callee.
Definition utils_helper.hpp:247
detail::shared_type< OwningPointer > share(OwningPointer &&p)
Marks the object pointed to by p as shared.
Definition utils_helper.hpp:224