diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf1fee28..894a3d99 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -9,12 +9,24 @@
 option( BUILD_UNITY "enables unity build for faster compile times" ON )
 option( BUILD_CODE_COV "enables compiler option required for code coverage analysis" OFF )
 option( BUILD_ML "enables build with tensorflow backend access" OFF )
 option( BUILD_MPI "enables build with MPI access" OFF )
+option( BUILD_CUDA_HPC "enables CUDA backend for SN HPC solver (single GPU)" OFF )
 
 #################################################
 ### COMPILER ####################################
 set( CMAKE_CXX_STANDARD 17 )
 set( CMAKE_CXX_STANDARD_REQUIRED ON )
+if( BUILD_CUDA_HPC )
+    enable_language( CUDA )
+    if( CMAKE_VERSION VERSION_LESS 3.18 )
+        # CMake 3.16 does not ship CUDA17 mappings by default.
+        set( CMAKE_CUDA17_STANDARD_COMPILE_OPTION "--std=c++17" )
+        set( CMAKE_CUDA17_EXTENSION_COMPILE_OPTION "--std=c++17" )
+    endif()
+    set( CMAKE_CUDA_STANDARD 17 )
+    set( CMAKE_CUDA_STANDARD_REQUIRED ON )
+    set( CMAKE_CUDA_EXTENSIONS OFF )
+endif()
 set( KITRT_RELEASE_OPTIONS -march=native -w )
 set( KITRT_RELWITHDEBINFO_OPTIONS -march=native -pg -no-pie )
 set( KITRT_DEBUG_OPTIONS -Wall -Wextra -Wpedantic )
@@ -30,6 +42,21 @@ set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})
 
 find_package( OpenMP REQUIRED )
+if( BUILD_CUDA_HPC )
+    add_compile_definitions( KITRT_ENABLE_CUDA_HPC )
+    if( CMAKE_VERSION VERSION_GREATER_EQUAL 3.17 )
+        find_package( CUDAToolkit QUIET )
+    endif()
+    if( DEFINED CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES )
+        include_directories( ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} )
+    elseif( EXISTS "/usr/local/cuda/include" )
+        include_directories( /usr/local/cuda/include )
+    endif()
+    message( STATUS "CUDA HPC solver: enabled" )
+else()
+    message( STATUS "CUDA HPC solver: disabled" )
+endif()
+
 message(STATUS "MPI build flag: ${BUILD_MPI}")
 if( BUILD_MPI )
     add_definitions(-DIMPORT_MPI)
@@ -72,6 +99,20 @@
 else()
     set( CORE_LIBRARIES ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES} ${VTK_LIBRARIES} OpenMP::OpenMP_CXX -lstdc++fs )
 endif()
+if( BUILD_CUDA_HPC )
+    if( TARGET CUDA::cudart )
+        list( APPEND CORE_LIBRARIES CUDA::cudart )
+    else()
+        find_library( CUDART_LIBRARY
+            NAMES cudart
+            HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES} /usr/local/cuda/lib64 /usr/lib/x86_64-linux-gnu )
+        if( NOT CUDART_LIBRARY )
+            message( FATAL_ERROR "Could not find CUDA runtime library (cudart)." )
+        endif()
+        list( APPEND CORE_LIBRARIES ${CUDART_LIBRARY} )
+    endif()
+endif()
+
 #################################################
@@ -89,6 +130,12 @@ add_compile_definitions( GIT_HASH="${GIT_HASH}" )
 ### BUILD KIT-RT ################################
 file( GLOB_RECURSE SRCS RELATIVE ${CMAKE_SOURCE_DIR} "src/*.cpp" "include/*.h" )
+if( BUILD_CUDA_HPC )
+    list( APPEND SRCS "src/solvers/snsolver_hpc.cu" "include/solvers/snsolver_hpc_cuda.hpp" )
+    # Force nvcc to parse this translation unit as C++17 even if transitive
+    # imported target features inject an older fallback standard.
+    set_source_files_properties( src/solvers/snsolver_hpc.cu PROPERTIES COMPILE_FLAGS "--std=c++17" )
+endif()
 set( EXCLUDE_DIR "/gui/" )
 foreach( TMP_PATH ${SRCS} )
     string( FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND )
@@ -110,9 +157,15 @@
 if( BUILD_ML )
     target_link_libraries( ${CMAKE_PROJECT_NAME} ${CORE_LIBRARIES} ${TENSORFLOW_LIB})
 endif()
 target_link_libraries( ${CMAKE_PROJECT_NAME} ${CORE_LIBRARIES} )
-target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<CONFIG:Debug>:${KITRT_DEBUG_OPTIONS}>" )
-target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<CONFIG:RelWithDebInfo>:${KITRT_RELWITHDEBINFO_OPTIONS}>" )
-target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<CONFIG:Release>:${KITRT_RELEASE_OPTIONS}>" )
+if( BUILD_CUDA_HPC )
+    set_target_properties(
+        ${CMAKE_PROJECT_NAME}
+        PROPERTIES CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON CUDA_EXTENSIONS OFF
+    )
+endif()
+target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:Debug>>:${KITRT_DEBUG_OPTIONS}>" )
+target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RelWithDebInfo>>:${KITRT_RELWITHDEBINFO_OPTIONS}>" )
+target_compile_options( ${CMAKE_PROJECT_NAME} PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:Release>>:${KITRT_RELEASE_OPTIONS}>" )
 
 #################################################
 ### BUILD UNIT TESTS ############################
@@ -145,11 +198,17 @@ if( BUILD_TESTING )
     target_compile_definitions( unit_tests PUBLIC BUILD_TESTING )
     target_compile_definitions( unit_tests PUBLIC TESTS_PATH="${CMAKE_SOURCE_DIR}/tests/" )
     target_link_libraries( unit_tests Catch ${CORE_LIBRARIES} )
-    target_compile_options( unit_tests PUBLIC "$<$<CONFIG:Debug>:${KITRT_DEBUG_OPTIONS}>" )
+    if( BUILD_CUDA_HPC )
+        set_target_properties(
+            unit_tests
+            PROPERTIES CUDA_STANDARD 17 CUDA_STANDARD_REQUIRED ON CUDA_EXTENSIONS OFF
+        )
+    endif()
+    target_compile_options( unit_tests PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:Debug>>:${KITRT_DEBUG_OPTIONS}>" )
     if( BUILD_CODE_COV )
         if( CMAKE_COMPILER_IS_GNUCXX )
             set( CODE_COVERAGE_OPTIONS --coverage -g -O0 -w )
-            target_compile_options( unit_tests PUBLIC "$<$<CONFIG:Debug>:${CODE_COVERAGE_OPTIONS}>" )
+            target_compile_options( unit_tests PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:Debug>>:${CODE_COVERAGE_OPTIONS}>" )
             target_link_libraries( unit_tests Catch gcov )
         else()
             message( FATAL_ERROR "Code coverage is currently only supported for gcc!"
 )
@@ -157,8 +216,8 @@ if( BUILD_TESTING )
     else()
         target_link_libraries( unit_tests Catch )
     endif()
-    target_compile_options( unit_tests PUBLIC "$<$<CONFIG:RelWithDebInfo>:${KITRT_RELWITHDEBINFO_OPTIONS}>" )
-    target_compile_options( unit_tests PUBLIC "$<$<CONFIG:Release>:${KITRT_RELEASE_OPTIONS}>" )
+    target_compile_options( unit_tests PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RelWithDebInfo>>:${KITRT_RELWITHDEBINFO_OPTIONS}>" )
+    target_compile_options( unit_tests PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:Release>>:${KITRT_RELEASE_OPTIONS}>" )
     catch_discover_tests( unit_tests )
 endif()
 #################################################
@@ -179,9 +238,9 @@ if( BUILD_GUI )
     set( GUI_LIBRARIES Qt5::Core Qt5::Widgets Qt5::Gui Qt5::OpenGL ${VTK_LIBRARIES} )
     target_link_libraries( ${CMAKE_PROJECT_NAME}_gui ${CORE_LIBRARIES} ${GUI_LIBRARIES} )
     set_target_properties( ${CMAKE_PROJECT_NAME}_gui PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_SOURCE_DIR}/bin" )
-    target_compile_options( ${CMAKE_PROJECT_NAME}_gui PUBLIC "$<$<CONFIG:Debug>:${KITRT_DEBUG_OPTIONS}>" )
-    target_compile_options( ${CMAKE_PROJECT_NAME}_gui PUBLIC "$<$<CONFIG:RelWithDebInfo>:${KITRT_RELWITHDEBINFO_OPTIONS}>" )
-    target_compile_options( ${CMAKE_PROJECT_NAME}_gui PUBLIC "$<$<CONFIG:Release>:${KITRT_RELEASE_OPTIONS}>" )
+    target_compile_options( ${CMAKE_PROJECT_NAME}_gui PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:Debug>>:${KITRT_DEBUG_OPTIONS}>" )
+    target_compile_options( ${CMAKE_PROJECT_NAME}_gui PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:RelWithDebInfo>>:${KITRT_RELWITHDEBINFO_OPTIONS}>" )
+    target_compile_options( ${CMAKE_PROJECT_NAME}_gui PUBLIC "$<$<AND:$<COMPILE_LANGUAGE:CXX>,$<CONFIG:Release>>:${KITRT_RELEASE_OPTIONS}>" )
 endif()
 
 #################################################
diff --git a/README.md b/README.md
index 3bdd983b..987ab671 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ KiT-RT is an open-source, multi-fidelity **C++ PDE solver for radiative transport**
 * **Modular, HPC-ready** architecture. Supports hybrid **MPI/OpenMP distributed parallelism**.
 * **Containerized** for portable deployment across HPC systems (Docker & Singularity).
-* **Python-wrapped** via [CharmKIT](https://github.com/KiT-RT/CharmKIT)
+* **Python-wrapped** via [charm_kit](https://github.com/KiT-RT/charm_kit)
 * Downstream applications:
   - Data generation for scientific **foundation models**.
   - high-resolution **reference solutions** for AI-based surrogate modeling.
@@ -66,79 +66,144 @@ Applications include:
 
-## Installation
+## Installation and Run (By Parallelization)
 
-### Plain cpp setup
+One-time setup:
 ```bash
-# Clone repository
 git clone https://github.com/KiT-RT/kitrt_code.git
 cd kitrt_code
 git submodule update --init --recursive
+```
 
-# Build with CMake
-mkdir build && cd build
-cmake -DCMAKE_BUILD_TYPE=Release ../
-make -j
+Then run all commands from the repository root.
 
-```
-### Docker setup
-A preconfigured docker container can also be used to run the code.
-By running
+### 1. CPU (OpenMP only)
+
+#### 1a) Plain installation (no container)
 ```bash
-docker run --rm -ti -v $(pwd):/home kitrt/test:latest
+mkdir -p build_omp
+cd build_omp
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=OFF -DBUILD_ML=OFF ..
+make -j
+cd ..
+./build_omp/KiT-RT tests/input/validation_tests/SN_solver/checkerboard_SN.cfg
 ```
-Bash scripts are provided in the folder tools/CI to get started with the docker environments. To start an interactive docker environment, execute
+
+#### 1b) Docker installation
 ```bash
-docker run -i -t --rm -v $(pwd)/../..:/mnt kitrt/test:latest /bin/bash
+docker run --rm -it -v $(pwd):/mnt -w /mnt kitrt/test:latest /bin/bash
+mkdir -p build_docker_omp
+cd build_docker_omp
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=OFF -DBUILD_ML=OFF ..
+make -j
+cd ..
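+# Note: the repository is bind-mounted at /mnt (-v $(pwd):/mnt -w /mnt), so the
+# build tree and the binary below persist on the host after the container exits.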
+./build_docker_omp/KiT-RT tests/input/validation_tests/SN_solver/checkerboard_SN.cfg
 ```
 
-### Singularity setup
-Create the singularity container
+#### 1c) Singularity installation
 ```bash
-mkdir build_singularity
 cd tools/singularity
-sudo sh build_container.sh
-chmod +x install_kitrt_singularity.sh
-singularity exec kit_rt.sif ./install_kitrt_singularity.sh
+sudo singularity build kit_rt.sif kit_rt.def
+cd ../..
+mkdir -p build_singularity_omp
+cd build_singularity_omp
+singularity exec ../tools/singularity/kit_rt.sif \
+    cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=OFF -DBUILD_ML=OFF ..
+singularity exec ../tools/singularity/kit_rt.sif make -j
+cd ..
+singularity exec tools/singularity/kit_rt.sif \
+    ./build_singularity_omp/KiT-RT tests/input/validation_tests/SN_solver/checkerboard_SN.cfg
+```
+
+### 2. CPU (OpenMP + MPI)
+
+#### 2a) Plain installation (no container)
+```bash
+mkdir -p build_mpi
+cd build_mpi
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=OFF -DBUILD_ML=OFF ..
+make -j
+cd ..
+mpirun -np 4 ./build_mpi/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg
 ```
-Run the singularity container
+
+#### 2b) Singularity installation
 ```bash
-singularity shell --bind $(pwd)/../..:/mnt kit_rt.sif
+cd tools/singularity
+sudo singularity build kit_rt_MPI.sif kit_rt_MPI.def
+cd ../..
+mkdir -p build_singularity_mpi
+cd build_singularity_mpi
+singularity exec ../tools/singularity/kit_rt_MPI.sif \
+    cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=ON -DBUILD_CUDA_HPC=OFF -DBUILD_ML=OFF ..
+singularity exec ../tools/singularity/kit_rt_MPI.sif make -j
+cd ..
+singularity exec tools/singularity/kit_rt_MPI.sif \
+    mpirun -np 4 ./build_singularity_mpi/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg
 ```
 
-## Running simulations
-Within any of the above setups, navigate to the example folder and execute KiT-RT
+### 3. CPU + single GPU (OpenMP + CUDA)
+
+#### 3a) Singularity installation
 ```bash
-cd examples
-..//KiT-RT configs/lattice_SN.cfg
+cd tools/singularity
+sudo singularity build kit_rt_MPI_cuda.sif kit_rt_MPI_cuda.def
+cd ../..
+mkdir -p build_singularity_cuda
+cd build_singularity_cuda
+singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif \
+    cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF ..
+singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif make -j
+cd ..
+singularity exec --nv tools/singularity/kit_rt_MPI_cuda.sif \
+    ./build_singularity_cuda/KiT-RT tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg
 ```
 
-## Tensorflow backend
-If you choose to enable the integrated machine learning tools via the BUILD_ML option, you need to install the tensorflow C-API:
+When compiled with `-DBUILD_CUDA_HPC=ON`, HPC runs use the CUDA backend if a GPU is visible at startup, and fall back to the CPU solver if no GPU is detected.
+
+### 4. Build with TensorFlow backend (CPU + OpenMP only)
+
 ```bash
 FILENAME=libtensorflow-cpu-linux-x86_64-2.7.0.tar.gz
 wget -q --no-check-certificate https://storage.googleapis.com/tensorflow/libtensorflow/${FILENAME}
-tar -C /usr/local -xzf ${FILENAME}
-ldconfig /usr/local/lib
+sudo tar -C /usr/local -xzf ${FILENAME}
+sudo ldconfig /usr/local/lib
+rm ${FILENAME}
+mkdir -p build_ml
+cd build_ml
+cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=OFF -DBUILD_ML=ON ..
+make -j
+cd ..
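+# The neural MN config below exercises the TensorFlow-backed closure; it needs
+# the libtensorflow C library installed above to be resolvable at runtime.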
+./build_ml/KiT-RT tests/input/validation_tests/MN_solver/checkerboard_MN_neural.cfg
 ```
-and for a gpu based version (you need supported hardware and gpu drivers, see here):
+
+### 5. Debug build
+
 ```bash
-FILENAME=libtensorflow-gpu-linux-x86_64-2.7.0.tar.gz
-wget -q --no-check-certificate https://storage.googleapis.com/tensorflow/libtensorflow/${FILENAME}
-tar -C /usr/local -xzf ${FILENAME}
-ldconfig /usr/local/lib
+mkdir -p build_debug
+cd build_debug
+cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=OFF -DBUILD_ML=OFF ..
+make -j
+cd ..
+./build_debug/KiT-RT tests/input/validation_tests/SN_solver/checkerboard_SN.cfg
 ```
-Or use the docker container
+
+### 6. Test + coverage build
+
 ```bash
-docker run --rm -ti -v $(pwd):/home kitrt/test_ml:latest
+mkdir -p build_coverage
+cd build_coverage
+cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DBUILD_CODE_COV=ON -DBUILD_UNITY=OFF ..
+make -j
+./unit_tests
+ctest --output-on-failure
+gcovr -r .. --html-details coverage.html
 ```
 
 ## Python API
-The Python interface is provided via [CharmKIT](https://github.com/KiT-RT/CharmKIT), allowing seamless integration into AI and outer-loop (UQ, Optimization) workflows.
+The Python interface is provided via [charm_kit](https://github.com/KiT-RT/charm_kit), allowing seamless integration into AI and outer-loop (UQ, Optimization) workflows.
 Check the corresponding readme for further info
diff --git a/include/solvers/snsolver_hpc.hpp b/include/solvers/snsolver_hpc.hpp
index da61e072..c9753b41 100644
--- a/include/solvers/snsolver_hpc.hpp
+++ b/include/solvers/snsolver_hpc.hpp
@@ -80,7 +80,8 @@ class SNSolverHPC
     std::vector<double> _flux; /*!< @brief dim = _nCells x _nSys */
 
     // Output related members
-    std::vector<double> _scalarFlux;         /*!< @brief dim = _nCells */
+    std::vector<double> _scalarFlux;         /*!< @brief dim = _nCells */
+    std::vector<double> _scalarFluxPrevIter; /*!< @brief scalar flux snapshot before current solver iteration */
 
     // Lattice QOIS
     unsigned _nOutputMoments;
diff --git a/include/solvers/snsolver_hpc_cuda.hpp b/include/solvers/snsolver_hpc_cuda.hpp
new file mode 100644
index 00000000..5a6f73f0
--- /dev/null
+++ b/include/solvers/snsolver_hpc_cuda.hpp
@@ -0,0 +1,246 @@
+#ifndef SNSOLVERHPCCUDA_H
+#define SNSOLVERHPCCUDA_H
+
+// include Matrix, Vector definitions
+#include "common/globalconstants.hpp"
+#include "common/typedef.hpp"
+
+// externals
+#include "spdlog/spdlog.h"
+
+// Forward Declarations
+class Config;
+class Mesh;
+class ProblemBase;
+
+/*! @brief CUDA (single GPU) backend of the SN HPC solver. */
+class SNSolverHPCCUDA
+{
+
+  private:
+    struct DeviceBuffers;
+
+    int _rank;
+    int _numProcs;
+    unsigned long _localNSys;
+    unsigned long _startSysIdx;
+    unsigned long _endSysIdx;
+
+    double _curSimTime;  /*!< @brief current in-simulation time after current iteration */
+    double _currTime;    /*!< @brief wall-time after current iteration */
+    Config* _settings;   /*!< @brief config class for global information */
+    Mesh* _mesh;
+    ProblemBase* _problem;
+
+    // Time
+    unsigned long _nIter; /*!< @brief number of time steps; for non-CSD this equals _nEnergies, for CSD _maxIter = _nEnergies - 1 */
+    double _dT;           /*!< @brief energy/time step size */
+    int _idx_start_iter;  /*!< @brief index of first iteration */
+    // Mesh related members, memory optimized
+    unsigned long _nCells; /*!< @brief number of spatial cells */
+    unsigned long _nSys;   /*!< @brief number of equations in the transport system, i.e. num quad pts */
+    unsigned long _nq;     /*!< @brief number of quadrature points */
+    unsigned long _nDim;
+    unsigned long _nNbr;
+    unsigned long _nNodes;
+
+    std::vector<double> _areas;       /*!< @brief surface area of all spatial cells, dim(_areas) = _NCells */
+    std::vector<double> _normals;     /*!< @brief edge normals multiplied by edge length, dim(_normals) = (_NCells,nEdgesPerCell,spatialDim) */
+    std::vector<unsigned> _neighbors; /*!< @brief edge neighbor cell ids, dim(_neighbors) = (_NCells,nEdgesPerCell) */
+    std::vector<double> _cellMidPoints; /*!< @brief dim _nCells x _dim */
+
+    std::vector<double> _interfaceMidPoints;             /*!< @brief dim: _nCells x _nEdgesPerCell x _dim */
+    std::vector<BOUNDARY_TYPE> _cellBoundaryTypes;       /*!< @brief dim: _nCells */
+    std::map<unsigned, std::vector<double>> _ghostCells; /*!< @brief Vector of ghost cells for boundary conditions. CAN BE MORE EFFICIENT */
+    std::vector<double> _relativeInterfaceMidPt;         /*!< @brief dim _nCells * _nNbr * _nDim */
+    std::vector<double> _relativeCellVertices;           /*!< @brief dim _nCells * _nNbr * _nDim */
+
+    std::map<unsigned, bool> _ghostCellsReflectingY;  /*!< map that indicates if a ghost cell has a fixed value or is a mirroring boundary */
+    std::map<unsigned, bool> _ghostCellsReflectingX;  /*!< map that indicates if a ghost cell has a fixed value or is a mirroring boundary */
+    std::vector<unsigned> _quadratureYReflection;     /*!< map that gives a reflection against the y axis for the velocity ordinates */
+    std::vector<unsigned> _quadratureXReflection;     /*!< map that gives a reflection against the x axis for the velocity ordinates */
+
+    unsigned _temporalOrder;      /*!< @brief temporal order (current: 1 & 2) */
+    unsigned _spatialOrder;       /*!< @brief spatial order (current: 1 & 2) */
+    std::vector<double> _solDx;   /*!< @brief dim = _nCells x _nSys x _dim */
+    std::vector<double> _limiter; /*!< @brief dim = _nCells x _nSys */
+
+    // Scattering, absorption and source
+    std::vector<double> _sigmaS;           /*!< @brief dim: _nCells */
+    std::vector<double> _sigmaT;           /*!< @brief dim: _nCells */
+    std::vector<double> _source;           /*!< @brief dim: _nCells x _nSys */
+    std::vector<double> _scatteringKernel; /*!< @brief dim: _nSys x _nSys */
+
+    // quadrature related numbers
+    std::vector<double> _quadPts;     /*!< @brief dim: _nSys x _dim */
+    std::vector<double> _quadWeights; /*!< @brief dim: _nSys */
+
+    // Solution related members
+    std::vector<double> _sol;  /*!< @brief dim = _nCells x _nSys */
+    std::vector<double> _flux; /*!< @brief dim = _nCells x _nSys */
+
+    // Output related members
+    std::vector<double> _scalarFlux;         /*!< @brief dim = _nCells */
+    std::vector<double> _scalarFluxPrevIter; /*!< @brief scalar flux snapshot before current solver iteration */
+
+    // Lattice QOIS
+    unsigned _nOutputMoments;
+
+    double _mass;
+    double _rmsFlux;
+    double _curAbsorptionLattice;    /*!< @brief Absorption of particles at Lattice checkerboard regions at current time step */
+    double _totalAbsorptionLattice;  /*!< @brief Absorption of particles at Lattice checkerboard regions integrated until current time step */
+    double _curMaxAbsorptionLattice; /*!< @brief Maximum pointwise absorption of particles at Lattice checkerboard regions until current time step */
+    double _curScalarOutflow;        /*!< @brief Outflow over whole boundary at current time step */
+    double _totalScalarOutflow;      /*!< @brief Outflow over whole boundary integrated until current time step */
+    double _curMaxOrdinateOutflow;   /*!< @brief Maximum ordinate-wise outflow over boundary over all time steps */
+    std::vector<double> _localMaxOrdinateOutflow; /*!< @brief Maximum ordinate-wise outflow over boundary over all time steps */
+    double _curScalarOutflowPeri1;   /*!< @brief Outflow over whole boundary at current time step */
+    double _totalScalarOutflowPeri1; /*!< @brief Outflow over whole boundary integrated until current time step */
+    double _curScalarOutflowPeri2;   /*!< @brief Outflow over whole boundary at current time step */
+    double _totalScalarOutflowPeri2; /*!< @brief Outflow over whole boundary integrated until current time step */
+    // helper
+    std::map<unsigned, std::vector<unsigned>> _cellsLatticePerimeter1;
+    std::map<unsigned, std::vector<unsigned>> _cellsLatticePerimeter2;
+    std::vector<bool> _isPerimeterLatticeCell1;
+    std::vector<bool> _isPerimeterLatticeCell2;
+
+    // Hohlraum QOIS
+    double _totalAbsorptionHohlraumCenter;
+    double _totalAbsorptionHohlraumVertical;
+    double _totalAbsorptionHohlraumHorizontal;
+    double _curAbsorptionHohlraumCenter;
+    double _curAbsorptionHohlraumVertical;
+    double _curAbsorptionHohlraumHorizontal;
+    double _varAbsorptionHohlraumGreen;
+
+    std::vector<std::vector<unsigned>> _probingCellsHohlraum; /*!< @brief Indices of cells that contain a probing sensor */
+    std::vector<double> _probingMoments;   /*!< @brief Solution moments at the probing cells that contain a probing sensor */
+    unsigned _probingMomentsTimeIntervals; /*!< @brief Number of time intervals for the probing-moment output */
+
+    unsigned _nProbingCellsLineGreen;               /*!< @brief Number of sampling cells that contain a probing sensor for the sliding window */
+    std::vector<unsigned> _probingCellsLineGreen;   /*!< @brief Indices of cells that contain a probing sensor for the sliding window */
+    std::vector<double> _absorptionValsLineSegment; /*!< @brief Avg absorption value at the sampling points of lineGreen */
+
+    unsigned _nProbingCellsBlocksGreen;
+    std::vector<std::vector<unsigned>> _probingCellsBlocksGreen; /*!< @brief Indices of cells that contain a probing sensor blocks */
+    std::vector<double> _absorptionValsBlocksGreen;              /*!< @brief Avg absorption value at the sampling blocks of lineGreen */
+
+    // Design parameters
+    std::vector<double> _cornerUpperLeftGreen;  /*!< @brief Coord of corner of the green area (minus thickness/2 of it) relative to the green center */
+    std::vector<double> _cornerLowerLeftGreen;  /*!< @brief Coord of corner of the green area (minus thickness/2 of it) relative to the green center */
+    std::vector<double> _cornerUpperRightGreen; /*!< @brief Coord of corner of the green area (minus thickness/2 of it) relative to the green center */
+    std::vector<double> _cornerLowerRightGreen; /*!< @brief Coord of corner of the green area (minus thickness/2 of it) relative to the green center */
+
+    double _thicknessGreen;           /*!< @brief thickness of the green area */
+    std::vector<double> _centerGreen; /*!< @brief Center of the Hohlraum */
+
+    // Output
+    std::vector<std::vector<std::vector<double>>> _outputFields; /*!< @brief Solver Output: dimensions (GroupID,FieldID,CellID). */
+    std::vector<std::vector<std::string>> _outputFieldNames;     /*!< @brief Names of the outputFields: dimensions (GroupID,FieldID) */
+
+    std::vector<double> _screenOutputFields;          /*!< @brief Solver Output: dimensions (FieldID). */
+    std::vector<std::string> _screenOutputFieldNames; /*!< @brief Names of the outputFields: dimensions (FieldID) */
+
+    std::vector<double> _historyOutputFields;          /*!< @brief Solver Output: dimensions (FieldID). */
+    std::vector<std::string> _historyOutputFieldNames; /*!< @brief Names of the outputFields: dimensions (FieldID) */
+
+    // CUDA backend (single GPU for first version)
+    bool _cudaInitialized;
+    int _cudaDeviceId;
+    DeviceBuffers* _device;
+    std::vector<int> _cellBoundaryTypesInt;
+    std::vector<double> _ghostCellValues;
+
+    // ---- Member functions ----
+
+    // Solver
+    void FluxOrder1();
+    void FluxOrder2();
+
+    void FVMUpdate();
+
+    void IterPostprocessing();
+
+    /*! @brief Sets vector of ghost cell values according to boundary conditions */
+    void SetGhostCells();
+
+    // Helper
+    /*! @brief ComputeTimeStep calculates the maximal stable time step using the CFL number
+        @param cfl Courant-Friedrichs-Lewy condition number */
+    double ComputeTimeStep( double cfl ) const;
+
+    // IO
+    /*! @brief Initializes the output groups and fields of this solver and names the fields */
+    void PrepareVolumeOutput();
+    /*! @brief Function that prepares VTK export and csv export of the current solver iteration
+        @param idx_iter current (pseudo) time iteration */
+    void WriteVolumeOutput( unsigned idx_iter );
+    /*! @brief Save output solution at given energy (pseudo time) to VTK file. Write frequency is
+        given by option VOLUME_OUTPUT_FREQUENCY. Always prints last iteration without iteration affix.
+        @param idx_iter current (pseudo) time iteration */
+    void PrintVolumeOutput( int idx_iter );
+    /*! @brief Initializes the output fields and their names for the screen output */
+    void PrepareScreenOutput();
+
+    /*! @brief Function that writes screen and history output fields
+        @param idx_iter current (pseudo) time iteration */
+    void WriteScalarOutput( unsigned idx_iter );
+    /*! @brief Prints ScreenOutputFields to screen and to logger. Write frequency is given by
+        option SCREEN_OUTPUT_FREQUENCY. Always prints last iteration.
+        @param idx_iter current (pseudo) time iteration */
+    void PrintScreenOutput( unsigned idx_iter );
+    /*! @brief Initializes the historyOutputFields and their names for history output. Write
+        frequency is given by option HISTORY_OUTPUT_FREQUENCY. Always prints last iteration. */
+    void PrepareHistoryOutput();
+    /*! @brief Prints HistoryOutputFields to logger
+        @param idx_iter current (pseudo) time iteration */
+    void PrintHistoryOutput( unsigned idx_iter );
+    /*! @brief Pre solver screen and logger output */
+    void DrawPreSolverOutput();
+    /*! @brief Post solver screen and logger output */
+    void DrawPostSolverOutput();
+
+    // Helper
+    unsigned long Idx2D( unsigned long idx1, unsigned long idx2, unsigned long len2 );
+    unsigned long Idx3D( unsigned long idx1, unsigned long idx2, unsigned long idx3, unsigned long len2, unsigned long len3 );
+    bool IsAbsorptionLattice( double x, double y ) const;
+    void ComputeCellsPerimeterLattice();
+
+    void SetProbingCellsLineGreen();
+    void ComputeQOIsGreenProbingLine();
+    std::vector<std::vector<double>> linspace2D( const std::vector<double>& start, const std::vector<double>& end, unsigned num_points );
+
+    void InitCUDA();
+    void UploadStaticDataToDevice();
+    void UploadStateToDevice();
+    void DownloadStateFromDevice();
+    void FreeCUDA();
+
+  public:
+    /*! @brief Solver constructor
+     *  @param settings config class that stores all needed config information */
+    SNSolverHPCCUDA( Config* settings );
+
+    ~SNSolverHPCCUDA();
+
+    /*! @brief Solve function runs the main iteration loop. */
+    void Solve();
+
+    /*! @brief Save output solution to VTK file */
+    void PrintVolumeOutput() const {}    // Only for debugging purposes.
+};
+#endif    // SNSOLVERHPCCUDA_H
diff --git a/src/main.cpp b/src/main.cpp
index 7a0820cb..76d8f63f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,6 +14,10 @@
 #include "common/io.hpp"
 #include "datagenerator/datageneratorbase.hpp"
 #include "solvers/snsolver_hpc.hpp"
+#ifdef KITRT_ENABLE_CUDA_HPC
+#include <cuda_runtime.h>
+#include "solvers/snsolver_hpc_cuda.hpp"
+#endif
 #include "solvers/solverbase.hpp"
 
 #ifdef BUILD_GUI
@@ -59,10 +63,23 @@ int main( int argc, char** argv ) {
     else {
         // Build solver
         if( config->GetHPC() ) {
+#ifdef KITRT_ENABLE_CUDA_HPC
+            int deviceCount = 0;
+            if( cudaGetDeviceCount( &deviceCount ) == cudaSuccess && deviceCount > 0 ) {
+                SNSolverHPCCUDA* solver = new SNSolverHPCCUDA( config );
+                solver->Solve();
+                delete solver;
+            }
+            else {
+                SNSolverHPC* solver = new SNSolverHPC( config );
+                solver->Solve();
+                delete solver;
+            }
+#else
             SNSolverHPC* solver = new SNSolverHPC( config );
-            // Run solver and export
             solver->Solve();
             delete solver;
+#endif
         }
         else {
             SolverBase* solver = SolverBase::Create( config );
diff --git a/src/solvers/snsolver_hpc.cpp b/src/solvers/snsolver_hpc.cpp
index 162c0b4e..678a78d3 100644
--- a/src/solvers/snsolver_hpc.cpp
+++ b/src/solvers/snsolver_hpc.cpp
@@ -92,6 +92,7 @@ SNSolverHPC::SNSolverHPC( Config* settings ) {
 
     _flux       = std::vector<double>( _nCells * _localNSys );
     _scalarFlux = std::vector<double>( _nCells );
+    _scalarFluxPrevIter      = std::vector<double>( _nCells );
     _localMaxOrdinateOutflow = std::vector<double>( _nCells );
 
     auto areas = _mesh->GetCellAreas();
@@ -167,7 +168,6 @@ SNSolverHPC::SNSolverHPC( Config* settings ) {
             _scalarFlux[idx_cell] += _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];
         }
-        // _mass += _scalarFlux[idx_cell] * _areas[idx_cell];
     }
 
     // Lattice
@@ -298,6 +298,7 @@ void SNSolverHPC::Solve() {
         if( iter == _nIter - 1 ) {    // last iteration
             _dT = _settings->GetTEnd() - iter * _dT;
         }
+        _scalarFluxPrevIter = _scalarFlux;
 
         if( _temporalOrder == 2 ) {
             std::vector<double> solRK0( _sol );
             ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
@@ -554,12 +555,9 @@ void SNSolverHPC::FluxOrder1() {
 }
 
 void SNSolverHPC::FVMUpdate() {
-    _mass    = 0.0;
-    _rmsFlux = 0.0;
     std::vector<double> temp_scalarFlux( _nCells );    // for MPI allreduce
-    std::vector<double> prev_scalarFlux( _scalarFlux );
-#pragma omp parallel for reduction( + : _mass )
+#pragma omp parallel for
     for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
 #pragma omp simd
@@ -577,7 +575,6 @@
             _sol[Idx2D( idx_cell, idx_sys, _localNSys )] = std::max( _sol[Idx2D( idx_cell, idx_sys, _localNSys )], 0.0 );
             localScalarFlux += _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];
         }
-        _mass += localScalarFlux * _areas[idx_cell];
         temp_scalarFlux[idx_cell] = localScalarFlux;    // set flux
     }
     // MPI Allreduce: _scalarFlux
@@ -585,25 +582,22 @@
     MPI_Barrier( MPI_COMM_WORLD );
     MPI_Allreduce( temp_scalarFlux.data(), _scalarFlux.data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
     MPI_Barrier( MPI_COMM_WORLD );
-    if( _rank == 0 ) {
-        for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
-            double diff = _scalarFlux[idx_cell] - prev_scalarFlux[idx_cell];
-            _rmsFlux += diff * diff;
-        }
-    }
 #endif
 #ifndef IMPORT_MPI
     _scalarFlux = temp_scalarFlux;
-#pragma omp parallel for reduction( + : _rmsFlux )
-    for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
-        double diff = _scalarFlux[idx_cell] - prev_scalarFlux[idx_cell];
-        _rmsFlux += diff * diff;
-    }
 #endif
 }
 
 void SNSolverHPC::IterPostprocessing() {
     // ALLREDUCE NEEDED
+    _mass    = 0.0;
+    _rmsFlux = 0.0;
+#pragma omp parallel for reduction( + : _mass, _rmsFlux )
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
+        _mass += _scalarFlux[idx_cell] * _areas[idx_cell];
+        double diff = _scalarFlux[idx_cell] - _scalarFluxPrevIter[idx_cell];
+        _rmsFlux += diff * diff;
+    }
 
     _curAbsorptionLattice = 0.0;
     _curScalarOutflow     = 0.0;
@@ -747,8 +741,6 @@
     double tmp_curScalarOutflowPeri1 = 0.0;
     double tmp_curScalarOutflowPeri2 = 0.0;
     double tmp_curMaxOrdinateOutflow = 0.0;
-    double tmp_mass                  = 0.0;
-    double tmp_rmsFlux               = 0.0;
     MPI_Barrier( MPI_COMM_WORLD );
     MPI_Allreduce( &_curScalarOutflow, &tmp_curScalarOutflow, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
     _curScalarOutflow = tmp_curScalarOutflow;
@@ -762,12 +754,6 @@
     MPI_Allreduce( &_curMaxOrdinateOutflow, &tmp_curMaxOrdinateOutflow, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD );
     _curMaxOrdinateOutflow = tmp_curMaxOrdinateOutflow;
     MPI_Barrier( MPI_COMM_WORLD );
-    MPI_Allreduce( &_mass, &tmp_mass, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
-    _mass = tmp_mass;
-    MPI_Barrier( MPI_COMM_WORLD );
-    MPI_Allreduce( &_rmsFlux, &tmp_rmsFlux, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
-    _rmsFlux = tmp_rmsFlux;
-    MPI_Barrier( MPI_COMM_WORLD );
 #endif
     // Variation absorption (part II)
     if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
diff --git a/src/solvers/snsolver_hpc.cu b/src/solvers/snsolver_hpc.cu
new file mode 100644
index 00000000..e016c797
--- /dev/null
+++ b/src/solvers/snsolver_hpc.cu
@@ -0,0 +1,2066 @@
+#ifdef IMPORT_MPI
+#include <mpi.h>
+#endif
+#include "common/config.hpp"
+#include "common/io.hpp"
+#include "common/mesh.hpp"
+#include "kernels/scatteringkernelbase.hpp"
+#include "problems/problembase.hpp"
+#include "quadratures/quadraturebase.hpp"
+#include "solvers/snsolver_hpc_cuda.hpp"
+#include "toolboxes/textprocessingtoolbox.hpp"
+#include <algorithm>
+#include <chrono>
+#include <cstddef>
+#include <cuda_runtime.h>
+#include <string>
+
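+// Device buffers mirror the flattened host vectors one-to-one. Host and device
+// share the same linearization (cf. Idx2DDevice / Idx3DDevice below):
+//   sol[cell][ordinate]   -> sol[cell * nSys + ordinate]
+//   normals[cell][nbr][d] -> normals[(cell * nNbr + nbr) * nDim + d]
+// so each buffer moves with a single contiguous cudaMemcpy, no marshaling.
+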
+struct SNSolverHPCCUDA::DeviceBuffers
+{
+    double* areas                = nullptr;
+    double* normals              = nullptr;
+    unsigned* neighbors          = nullptr;
+    int* boundaryTypes           = nullptr;
+    double* ghostCellValues      = nullptr;
+    double* cellMidPoints        = nullptr;
+    double* interfaceMidPoints   = nullptr;
+    double* relativeInterfaceMid = nullptr;
+    double* relativeCellVertices = nullptr;
+
+    double* sigmaS      = nullptr;
+    double* sigmaT      = nullptr;
+    double* source      = nullptr;
+    double* quadPts     = nullptr;
+    double* quadWeights = nullptr;
+
+    double* sol        = nullptr;
+    double* solRK0     = nullptr;
+    double* flux       = nullptr;
+    double* scalarFlux = nullptr;
+    double* solDx      = nullptr;
+    double* limiter    = nullptr;
+};
+
+namespace {
+
+constexpr int CUDA_BLOCK_SIZE = 256;
+
+inline void CheckCuda( cudaError_t status, const std::string& where ) {
+    if( status != cudaSuccess ) {
+        ErrorMessages::Error( "CUDA error in " + where + ": " + std::string( cudaGetErrorString( status ) ), where );
+    }
+}
+
+template <typename T>
+inline void AllocateDeviceArray( T** ptr, std::size_t count, const std::string& name ) {
+    if( count == 0 ) {
+        *ptr = nullptr;
+        return;
+    }
+    CheckCuda( cudaMalloc( reinterpret_cast<void**>( ptr ), count * sizeof( T ) ), "cudaMalloc(" + name + ")" );
+}
+
+template <typename T>
+inline void FreeDeviceArray( T*& ptr, const std::string& name ) {
+    if( ptr == nullptr ) return;
+    CheckCuda( cudaFree( ptr ), "cudaFree(" + name + ")" );
+    ptr = nullptr;
+}
+
+__device__ __forceinline__ unsigned long Idx2DDevice( unsigned long idx1, unsigned long idx2, unsigned long len2 ) { return idx1 * len2 + idx2; }
+
+__device__ __forceinline__ unsigned long Idx3DDevice( unsigned long idx1, unsigned long idx2, unsigned long idx3, unsigned long len2, unsigned long len3 ) {
+    return idx1 * len2 * len3 + idx2 * len3 + idx3;
+}
+
+__global__ void FluxOrder1Kernel( unsigned long nCells,
+                                  unsigned long localNSys,
+                                  unsigned long nNbr,
+                                  unsigned long nDim,
+                                  const double* __restrict__ quadPts,
+                                  const double* __restrict__ normals,
+                                  const unsigned* __restrict__ neighbors,
+                                  const int* __restrict__ boundaryTypes,
+                                  const double* __restrict__ ghostCellValues,
+                                  const double* __restrict__ sol,
+                                  double* flux,
+                                  int neumannBoundary ) {
+    const unsigned long linearIdx = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    const unsigned long nCellSys  = nCells * localNSys;
+    if( linearIdx >= nCellSys ) return;
+
+    const unsigned long idxCell = linearIdx / localNSys;
+    const unsigned long idxSys  = linearIdx % localNSys;
+
+    double localFlux = 0.0;
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const unsigned nbrCell  = neighbors[Idx2DDevice( idxCell, idxNbr, nNbr )];
+        const double localInner = quadPts[Idx2DDevice( idxSys, 0, nDim )] * normals[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] +
+                                  quadPts[Idx2DDevice( idxSys, 1, nDim )] * normals[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )];
+
+        if( boundaryTypes[idxCell] == neumannBoundary && nbrCell == nCells ) {
+            localFlux += ( localInner > 0.0 ) ? localInner * sol[Idx2DDevice( idxCell, idxSys, localNSys )]
+                                              : localInner * ghostCellValues[Idx2DDevice( idxCell, idxSys, localNSys )];
+        }
+        else {
+            localFlux += ( localInner > 0.0 ) ? localInner * sol[Idx2DDevice( idxCell, idxSys, localNSys )]
+                                              : localInner * sol[Idx2DDevice( static_cast<unsigned long>( nbrCell ), idxSys, localNSys )];
+        }
+    }
+    flux[Idx2DDevice( idxCell, idxSys, localNSys )] = localFlux;
+}
+
+__global__ void FluxOrder2SlopeKernel( unsigned long nCells,
+                                       unsigned long localNSys,
+                                       unsigned long nNbr,
+                                       unsigned long nDim,
+                                       const double* __restrict__ areas,
+                                       const double* __restrict__ normals,
+                                       const unsigned* __restrict__ neighbors,
+                                       const int* __restrict__ boundaryTypes,
+                                       const double* __restrict__ sol,
+                                       double* solDx,
+                                       double* limiter,
+                                       int noneBoundary ) {
+    const unsigned long linearIdx = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    const unsigned long nCellSys  = nCells * localNSys;
+    if( linearIdx >= nCellSys ) return;
+
+    const unsigned long idxCell = linearIdx / localNSys;
+    const unsigned long idxSys  = linearIdx % localNSys;
+
+    const unsigned long limiterIdx = Idx2DDevice( idxCell, idxSys, localNSys );
+    const unsigned long solDxXIdx  = Idx3DDevice( idxCell, idxSys, 0, localNSys, nDim );
+    const unsigned long solDxYIdx  = Idx3DDevice( idxCell, idxSys, 1, localNSys, nDim );
+
+    if( boundaryTypes[idxCell] != noneBoundary ) {
+        limiter[limiterIdx] = 0.0;
+        solDx[solDxXIdx]    = 0.0;
+        solDx[solDxYIdx]    = 0.0;
+        return;
+    }
+
+    double slopeX = 0.0;
+    double slopeY = 0.0;
+    const double thisSol = sol[Idx2DDevice( idxCell, idxSys, localNSys )];
+
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const unsigned nbrCell = neighbors[Idx2DDevice( idxCell, idxNbr, nNbr )];
+        const double avgVal    = 0.5 * ( thisSol + sol[Idx2DDevice( static_cast<unsigned long>( nbrCell ), idxSys, localNSys )] );
+        slopeX += avgVal * normals[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )];
+        slopeY += avgVal * normals[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )];
+    }
+
+    slopeX /= areas[idxCell];
+    slopeY /= areas[idxCell];
+
+    limiter[limiterIdx] = 1.0;
+    solDx[solDxXIdx]    = slopeX;
+    solDx[solDxYIdx]    = slopeY;
+}
+
+__global__ void FluxOrder2LimiterKernel( unsigned long nCells,
+                                         unsigned long localNSys,
+                                         unsigned long nNbr,
+                                         unsigned long nDim,
+                                         const double* __restrict__ areas,
+                                         const unsigned* __restrict__ neighbors,
+                                         const int* __restrict__ boundaryTypes,
+                                         const double* __restrict__ relativeCellVertices,
+                                         const double* __restrict__ sol,
+                                         const double* __restrict__ solDx,
+                                         double* limiter,
+                                         int noneBoundary,
+                                         double eps ) {
+    const unsigned long linearIdx = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    const unsigned long nCellSys  = nCells * localNSys;
+    if( linearIdx >= nCellSys ) return;
+
+    const unsigned long idxCell = linearIdx / localNSys;
+    const unsigned long idxSys  = linearIdx % localNSys;
+    if( boundaryTypes[idxCell] != noneBoundary ) return;
+
+    const unsigned long solIdx = Idx2DDevice( idxCell, idxSys, localNSys );
+    const double thisSol       = sol[solIdx];
+
+    double minSol = thisSol;
+    double maxSol = thisSol;
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const unsigned nbrCell = neighbors[Idx2DDevice( idxCell, idxNbr, nNbr )];
+        const double nbrSol    = sol[Idx2DDevice( static_cast<unsigned long>( nbrCell ), idxSys, localNSys )];
+        maxSol = fmax( maxSol, nbrSol );
+        minSol = fmin( minSol, nbrSol );
+    }
+
+    double limVal   = 1.0;
+    const double d  = areas[idxCell];
+    const double sx = solDx[Idx3DDevice( idxCell, idxSys, 0, localNSys, nDim )];
+    const double sy = solDx[Idx3DDevice( idxCell, idxSys, 1, localNSys, nDim )];
+
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const double gaussPoint = sx * relativeCellVertices[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] +
+                                  sy * relativeCellVertices[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )];
+
+        const double delta1Max = maxSol - thisSol;
+        const double delta1Min = minSol - thisSol;
+        double r = 1.0;
+        if( gaussPoint > 0.0 ) {
+            r = ( ( delta1Max * delta1Max + d ) * gaussPoint + 2.0 * gaussPoint * gaussPoint * delta1Max ) /
+                ( delta1Max * delta1Max + 2.0 * gaussPoint * gaussPoint + delta1Max * gaussPoint + d ) / ( gaussPoint + eps );
+        }
+        else {
+            r = ( ( delta1Min * delta1Min + d ) * gaussPoint + 2.0 * gaussPoint * gaussPoint * delta1Min ) /
+                ( delta1Min * delta1Min + 2.0 * gaussPoint * gaussPoint + delta1Min * gaussPoint + d ) / ( gaussPoint - eps );
+        }
+        if( fabs( gaussPoint ) < eps ) {
+            r = 1.0;
+        }
+        limVal = fmin( limVal, r );
+    }
+    limiter[solIdx] = limVal;
+}
+
+__global__ void FluxOrder2Kernel( unsigned long nCells,
+                                  unsigned long localNSys,
+                                  unsigned long nNbr,
+                                  unsigned long nDim,
+                                  const double* __restrict__ quadPts,
+                                  const double* __restrict__ normals,
+                                  const unsigned* __restrict__ neighbors,
+                                  const int* __restrict__ boundaryTypes,
+                                  const double* __restrict__ ghostCellValues,
+                                  const double* __restrict__ interfaceMidPoints,
+                                  const double* __restrict__ cellMidPoints,
+                                  const double* __restrict__ relativeInterfaceMid,
+                                  const double* __restrict__ sol,
+                                  const double* __restrict__ solDx,
+                                  const double* __restrict__ limiter,
+                                  double* flux,
+                                  int neumannBoundary ) {
+    const unsigned long linearIdx = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    const unsigned long nCellSys  = nCells * localNSys;
+    if( linearIdx >= nCellSys ) return;
+
+    const unsigned long idxCell = linearIdx / localNSys;
+    const unsigned long idxSys  = linearIdx % localNSys;
+
+    double localFlux = 0.0;
+    for( unsigned long idxNbr = 0; idxNbr < nNbr; ++idxNbr ) {
+        const unsigned nbrCell  = neighbors[Idx2DDevice( idxCell, idxNbr, nNbr )];
+        const double localInner = quadPts[Idx2DDevice( idxSys, 0, nDim )] * normals[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] +
+                                  quadPts[Idx2DDevice( idxSys, 1, nDim )] * normals[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )];
+
+        if( boundaryTypes[idxCell] == neumannBoundary && nbrCell == nCells ) {
+            localFlux += ( localInner > 0.0 ) ? localInner * sol[Idx2DDevice( idxCell, idxSys, localNSys )]
+                                              : localInner * ghostCellValues[Idx2DDevice( idxCell, idxSys, localNSys )];
+            continue;
+        }
+
+        if( localInner > 0.0 ) {
+            const double recVal = sol[Idx2DDevice( idxCell, idxSys, localNSys )] +
+                                  limiter[Idx2DDevice( idxCell, idxSys, localNSys )] *
+                                      ( solDx[Idx3DDevice( idxCell, idxSys, 0, localNSys, nDim )] *
+                                            relativeInterfaceMid[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] +
+                                        solDx[Idx3DDevice( idxCell, idxSys, 1, localNSys, nDim )] *
+                                            relativeInterfaceMid[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )] );
+            localFlux += localInner * recVal;
+        }
+        else {
+            const unsigned long nbrCellUL = static_cast<unsigned long>( nbrCell );
+            const double recVal = sol[Idx2DDevice( nbrCellUL, idxSys, localNSys )] +
+                                  limiter[Idx2DDevice( nbrCellUL, idxSys, localNSys )] *
+                                      ( solDx[Idx3DDevice( nbrCellUL, idxSys, 0, localNSys, nDim )] *
+                                            ( interfaceMidPoints[Idx3DDevice( idxCell, idxNbr, 0, nNbr, nDim )] -
+                                              cellMidPoints[Idx2DDevice( nbrCellUL, 0, nDim )] ) +
+                                        solDx[Idx3DDevice( nbrCellUL, idxSys, 1, localNSys, nDim )] *
+                                            ( interfaceMidPoints[Idx3DDevice( idxCell, idxNbr, 1, nNbr, nDim )] -
+                                              cellMidPoints[Idx2DDevice( nbrCellUL, 1, nDim )] ) );
+            localFlux += localInner * recVal;
+        }
+    }
+
+    flux[Idx2DDevice( idxCell, idxSys, localNSys )] = localFlux;
+}
+
+__global__ void FVMUpdateKernel( unsigned long nCells,
+                                 unsigned long localNSys,
+                                 double dT,
+                                 const double* __restrict__ sigmaS,
+                                 const double* __restrict__ sigmaT,
+                                 const double* __restrict__ source,
+                                 const double* __restrict__ areas,
+                                 const double* __restrict__ quadWeights,
+                                 const double* __restrict__ flux,
+                                 double* sol,
+                                 double* scalarFlux,
+                                 double invTwoPi ) {
+    const unsigned long idxCell = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    if( idxCell >= nCells ) return;
+
+    const double scattering = sigmaS[idxCell] * scalarFlux[idxCell] * invTwoPi;
+    double localScalarFlux  = 0.0;
+
+    for( unsigned long idxSys = 0; idxSys < localNSys; ++idxSys ) {
+        const unsigned long idx = Idx2DDevice( idxCell, idxSys, localNSys );
+        double newVal = ( 1.0 - dT * sigmaT[idxCell] ) * sol[idx] - dT / areas[idxCell] * flux[idx] + dT * ( scattering + source[idx] );
+        newVal        = fmax( newVal, 0.0 );
+        sol[idx]      = newVal;
+        localScalarFlux += newVal * quadWeights[idxSys];
+    }
+    scalarFlux[idxCell] = localScalarFlux;
+}
+
+__global__ void RK2AverageAndScalarFluxKernel( unsigned long nCells,
+                                               unsigned long localNSys,
+                                               const double* __restrict__ quadWeights,
+                                               const double* __restrict__ solRK0,
+                                               double* sol,
+                                               double* scalarFlux ) {
+    const unsigned long idxCell = static_cast<unsigned long>( blockIdx.x ) * blockDim.x + threadIdx.x;
+    if( idxCell >= nCells ) return;
+
+    double localScalarFlux = 0.0;
+    for( unsigned long idxSys = 0; idxSys < localNSys; ++idxSys ) {
+        const unsigned long idx = Idx2DDevice( idxCell, idxSys, localNSys );
+        const double avgVal     = 0.5 * ( solRK0[idx] + sol[idx] );
+        sol[idx]                = avgVal;
+        localScalarFlux += avgVal * quadWeights[idxSys];
+    }
+    scalarFlux[idxCell] = localScalarFlux;
+}
+
+}    // namespace
+
+SNSolverHPCCUDA::SNSolverHPCCUDA( Config* settings ) {
+#ifdef IMPORT_MPI
+    // Initialize MPI
+    MPI_Comm_size( MPI_COMM_WORLD, &_numProcs );
+    MPI_Comm_rank( MPI_COMM_WORLD, &_rank );
+#endif
+#ifndef IMPORT_MPI
+    _numProcs = 1;    // default values
+    _rank     = 0;
+#endif
+    _settings        = settings;
+    _currTime        = 0.0;
+    _idx_start_iter  = 0;
+    _nOutputMoments  = 2;    // Currently only u_1 (x direction) and u_1 (y direction) are supported
+    _cudaInitialized = false;
+    _cudaDeviceId    = 0;
+    _device          = nullptr;
+    // Create Mesh
+    _mesh = LoadSU2MeshFromFile( settings );
+    _settings->SetNCells( _mesh->GetNumCells() );
+    auto quad = QuadratureBase::Create( settings );
+    _settings->SetNQuadPoints( quad->GetNq() );
+
+    _nCells = static_cast<unsigned long>( _mesh->GetNumCells() );
+    _nNbr   = static_cast<unsigned long>( _mesh->GetNumNodesPerCell() );
+    _nDim   = static_cast<unsigned long>( _mesh->GetDim() );
+    _nNodes = static_cast<unsigned long>( _mesh->GetNumNodes() );
+
+    _nq   = static_cast<unsigned long>( quad->GetNq() );
+    _nSys = _nq;
+
+    if( static_cast<unsigned long>( _numProcs ) > _nq ) {
+        ErrorMessages::Error( "The number of processors must be less than or equal to the number of quadrature points.", CURRENT_FUNCTION );
+    }
+
+    if( _numProcs == 1 ) {
+        _localNSys   = _nSys;
+        _startSysIdx = 0;
+        _endSysIdx   = _nSys;
+    }
+    else {
+        _localNSys   = _nSys / ( _numProcs - 1 );
+        _startSysIdx = _rank * _localNSys;
+        _endSysIdx   = _rank * _localNSys + _localNSys;
+
+        if( _rank == _numProcs - 1 ) {
+            _localNSys = _nSys - _startSysIdx;
+            _endSysIdx = _nSys;
+        }
+    }
+
+    // std::cout << "Rank: " << _rank << " startSysIdx: " << _startSysIdx << " endSysIdx: " << _endSysIdx << " localNSys: " << _localNSys <<
+    // std::endl;
+
+    _spatialOrder  = _settings->GetSpatialOrder();
+    _temporalOrder = _settings->GetTemporalOrder();
+
+    _areas              = std::vector<double>( _nCells );
+    _normals            = std::vector<double>( _nCells * _nNbr * _nDim );
+    _neighbors          = std::vector<unsigned>( _nCells * _nNbr );
+    _cellMidPoints      = std::vector<double>( _nCells * _nDim );
+    _interfaceMidPoints = std::vector<double>( _nCells * _nNbr * _nDim );
+    _cellBoundaryTypes  = std::vector<BOUNDARY_TYPE>( _nCells );
+    _relativeInterfaceMidPt = std::vector<double>( _nCells * _nNbr * _nDim );
+    _relativeCellVertices   = std::vector<double>( _nCells * _nNbr * _nDim );
+
+    // Slope
+    _solDx   = std::vector<double>( _nCells * _localNSys * _nDim, 0.0 );
+    _limiter = std::vector<double>( _nCells * _localNSys, 0.0 );
+
+    // Physics
+    _sigmaS = std::vector<double>( _nCells );
+    _sigmaT = std::vector<double>( _nCells );
+    _source = std::vector<double>( _nCells * _localNSys );
+
+    // Quadrature
+    _quadPts     = std::vector<double>( _localNSys * _nDim );
+    _quadWeights = std::vector<double>( _localNSys );
+
+    // Solution
+    _sol  = std::vector<double>( _nCells * _localNSys );
+    _flux = std::vector<double>( _nCells * _localNSys );
+
+    _scalarFlux              = std::vector<double>( _nCells );
+    _scalarFluxPrevIter      = std::vector<double>( _nCells );
+    _localMaxOrdinateOutflow = std::vector<double>( _nCells );
+
+    auto areas           = _mesh->GetCellAreas();
+    auto neighbors       = _mesh->GetNeighbours();
+    auto normals         = _mesh->GetNormals();
+    auto cellMidPts      = _mesh->GetCellMidPoints();
+    auto interfaceMidPts = _mesh->GetInterfaceMidPoints();
+    auto boundaryTypes   = _mesh->GetBoundaryTypes();
+    auto nodes           = _mesh->GetNodes();
+    auto cells           = _mesh->GetCells();
+
+#pragma omp parallel for
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; idx_cell++ ) {
+        _areas[idx_cell] = areas[idx_cell];
+    }
+
+    _dT    = ComputeTimeStep( _settings->GetCFL() );
+    _nIter = unsigned( _settings->GetTEnd() / _dT ) + 1;
+
+    auto quadPoints  = quad->GetPoints();
+    auto quadWeights = quad->GetWeights();
+
+    _problem    = ProblemBase::Create( _settings, _mesh, quad );
+    auto sigmaT = _problem->GetTotalXS( Vector( _nIter, 0.0 ) );
+    auto sigmaS = _problem->GetScatteringXS( Vector( _nIter, 0.0 ) );
+    auto source = _problem->GetExternalSource( Vector( _nIter, 0.0 ) );
+
+    // Copy everything to solver
+    _mass = 0;
+#pragma omp parallel for
+    for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+        for( unsigned long idx_dim = 0; idx_dim < _nDim; idx_dim++ ) {
+            _quadPts[Idx2D( idx_sys, idx_dim, _nDim )] = quadPoints[idx_sys + _startSysIdx][idx_dim];
+        }
+        if( _settings->GetQuadName() == QUAD_GaussLegendreTensorized2D ) {
+            _quadWeights[idx_sys] =
+                2.0 * quadWeights[idx_sys + _startSysIdx];    // Rescaling of quadweights TODO: Check if this needs general refactoring
+        }
+        else {
+            _quadWeights[idx_sys] = quadWeights[idx_sys + _startSysIdx];    // Rescaling of quadweights TODO: Check if this needs general refactoring
+        }
+    }
+
+#pragma omp parallel for
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; idx_cell++ ) {
+        _cellBoundaryTypes[idx_cell] = boundaryTypes[idx_cell];
+
+        for( unsigned long idx_dim = 0; idx_dim < _nDim; idx_dim++ ) {
+            _cellMidPoints[Idx2D( idx_cell, idx_dim, _nDim )] = cellMidPts[idx_cell][idx_dim];
+        }
+
+        for( unsigned long idx_nbr = 0; idx_nbr < _nNbr; idx_nbr++ ) {
+
+            _neighbors[Idx2D( idx_cell, idx_nbr, _nNbr )] = neighbors[idx_cell][idx_nbr];
+
+            for( unsigned long idx_dim = 0; idx_dim < _nDim; idx_dim++ ) {
+                _normals[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )]            = normals[idx_cell][idx_nbr][idx_dim];
+                _interfaceMidPoints[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )] = interfaceMidPts[idx_cell][idx_nbr][idx_dim];
+                _relativeInterfaceMidPt[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )] =
+                    _interfaceMidPoints[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )] - _cellMidPoints[Idx2D( idx_cell, idx_dim, _nDim )];
+                _relativeCellVertices[Idx3D( idx_cell, idx_nbr, idx_dim, _nNbr, _nDim )] =
+                    nodes[cells[idx_cell][idx_nbr]][idx_dim] - cellMidPts[idx_cell][idx_dim];
+            }
+        }
+
+        _sigmaS[idx_cell]     = sigmaS[0][idx_cell];
+        _sigmaT[idx_cell]     = sigmaT[0][idx_cell];
+        _scalarFlux[idx_cell] = 0;
+
+        for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) {
+            _source[Idx2D( idx_cell, idx_sys, _localNSys )] = source[0][idx_cell][0];    // CAREFUL HERE hardcoded to isotropic source
+            _sol[Idx2D( idx_cell, idx_sys, _localNSys )]    = 0.0;                      // initial condition is zero
+
+            _scalarFlux[idx_cell] += _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys];
+        }
+        // _mass += _scalarFlux[idx_cell] * _areas[idx_cell];
+    }
+
+    // Lattice
+    {
+        _curAbsorptionLattice    = 0;
+        _totalAbsorptionLattice  = 0;
+        _curMaxAbsorptionLattice = 0;
+        _curScalarOutflow        = 0;
+        _totalScalarOutflow      = 0;
+        _curMaxOrdinateOutflow   = 0;
+        _curScalarOutflowPeri1   = 0;
+        _totalScalarOutflowPeri1 = 0;
+        _curScalarOutflowPeri2   = 0;
+        _totalScalarOutflowPeri2 = 0;
+        ComputeCellsPerimeterLattice();
+    }
+    // Hohlraum
+    {
+        _totalAbsorptionHohlraumCenter     = 0;
+        _totalAbsorptionHohlraumVertical   = 0;
+        _totalAbsorptionHohlraumHorizontal = 0;
+        _curAbsorptionHohlraumCenter       = 0;
+        _curAbsorptionHohlraumVertical     = 0;
+        _curAbsorptionHohlraumHorizontal   = 0;
+        _varAbsorptionHohlraumGreen        = 0;
+    }
+
+    if( _settings->GetLoadRestartSolution() )
+        _idx_start_iter = LoadRestartSolution( _settings->GetOutputFile(),
+                                               _sol,
+                                               _scalarFlux,
+                                               _rank,
+                                               _nCells,
+                                               _totalAbsorptionHohlraumCenter,
+                                               _totalAbsorptionHohlraumVertical,
+                                               _totalAbsorptionHohlraumHorizontal,
+                                               _totalAbsorptionLattice ) +
+                          1;
+
+#ifdef IMPORT_MPI
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    SetGhostCells();
+    InitCUDA();
+    UploadStaticDataToDevice();
+    UploadStateToDevice();
+
+    if( _rank == 0 ) {
+        PrepareScreenOutput();     // Screen Output
+        PrepareHistoryOutput();    // History Output
+    }
+#ifdef IMPORT_MPI
+    MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    delete quad;
+
+    // Initialize QOIS
+    _mass    = 0;
+    _rmsFlux = 0;
+
+    {    // Hohlraum
+
+        unsigned n_probes = 4;
+        if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) n_probes = 4;
+        // if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) n_probes = 2;
+
+        _probingMoments = std::vector<double>( n_probes * 3, 0. );    // 10 time horizons
+
+        if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+            _probingCellsHohlraum = {
+                _mesh->GetCellsofBall( -0.4, 0., 0.01 ),
+                _mesh->GetCellsofBall( 0.4, 0., 0.01 ),
+                _mesh->GetCellsofBall( 0., -0.5, 0.01 ),
+                _mesh->GetCellsofBall( 0., 0.5, 0.01 ),
+            };
+        }
+        // else if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) {
+        //     _probingCellsHohlraum = {
+        //         _mesh->GetCellsofBall( 0.4, 0., 0.01 ),
+        //         _mesh->GetCellsofBall( 0., 0.5, 0.01 ),
+        //     };
+        // }
+
+        // Green
+        _thicknessGreen = 0.05;
+
+        if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+            _centerGreen           = { _settings->GetPosXCenterGreenHohlraum(), _settings->GetPosYCenterGreenHohlraum() };
+            _cornerUpperLeftGreen  = { -0.2 + _centerGreen[0], 0.4 + _centerGreen[1] };
+            _cornerLowerLeftGreen  = { -0.2 + _centerGreen[0], -0.4 + _centerGreen[1] };
+            _cornerUpperRightGreen = { 0.2 + _centerGreen[0], 0.4 + _centerGreen[1] };
+            _cornerLowerRightGreen = { 0.2 + _centerGreen[0], -0.4 + _centerGreen[1] };
+        }
+        // else {
+        //     _centerGreen           = { 0.0, 0.0 };
+        //     _cornerUpperLeftGreen  = { 0., 0.4 - _thicknessGreen / 2.0 };
+        //     _cornerLowerLeftGreen  = { 0., +_thicknessGreen / 2.0 };
+        //     _cornerUpperRightGreen = { 0.2 - _thicknessGreen / 2.0, 0.4 - _thicknessGreen / 2.0 };
+        //     _cornerLowerRightGreen = { 0.2 - _thicknessGreen / 2.0, 0. + _thicknessGreen / 2.0 };
+        // }
+
+        _nProbingCellsLineGreen = _settings->GetNumProbingCellsLineHohlraum();
+
+        _nProbingCellsBlocksGreen  = 44;
+        _absorptionValsBlocksGreen = std::vector<double>( _nProbingCellsBlocksGreen, 0. );
+        _absorptionValsLineSegment = std::vector<double>( _nProbingCellsLineGreen, 0.0 );
+
+        SetProbingCellsLineGreen();    // ONLY FOR HOHLRAUM
+    }
+}
+
+void SNSolverHPCCUDA::InitCUDA() {
+    int nDevices = 0;
+    CheckCuda( cudaGetDeviceCount( &nDevices ), "cudaGetDeviceCount" );
+    if( nDevices < 1 ) {
+        ErrorMessages::Error( "No CUDA-capable GPU detected, but SNSolverHPCCUDA was requested.", CURRENT_FUNCTION );
+    }
+
+    _cudaDeviceId = 0;    // first version: pin to one GPU
+    CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" );
+
+    _device = new DeviceBuffers();
+
+    const std::size_t nCells      = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellNbr    = nCells * static_cast<std::size_t>( _nNbr );
+    const std::size_t nCellNbrDim = nCellNbr * static_cast<std::size_t>( _nDim );
+    const std::size_t nCellSys    = nCells * static_cast<std::size_t>( _localNSys );
+    const std::size_t nSysDim     = static_cast<std::size_t>( _localNSys ) * static_cast<std::size_t>( _nDim );
+
+    AllocateDeviceArray( &_device->areas, nCells, "areas" );
+    AllocateDeviceArray( &_device->normals, nCellNbrDim, "normals" );
+    AllocateDeviceArray( &_device->neighbors, nCellNbr, "neighbors" );
+    AllocateDeviceArray( &_device->boundaryTypes, nCells, "boundaryTypes" );
+    AllocateDeviceArray( &_device->ghostCellValues, nCellSys, "ghostCellValues" );
+    AllocateDeviceArray( &_device->cellMidPoints, nCells * static_cast<std::size_t>( _nDim ), "cellMidPoints" );
+    AllocateDeviceArray( &_device->interfaceMidPoints, nCellNbrDim, "interfaceMidPoints" );
+    AllocateDeviceArray( &_device->relativeInterfaceMid, nCellNbrDim, "relativeInterfaceMid" );
+    AllocateDeviceArray( &_device->relativeCellVertices, nCellNbrDim, "relativeCellVertices" );
+
+    AllocateDeviceArray( &_device->sigmaS, nCells, "sigmaS" );
+    AllocateDeviceArray( &_device->sigmaT, nCells, "sigmaT" );
+    AllocateDeviceArray( &_device->source, nCellSys, "source" );
+    AllocateDeviceArray( &_device->quadPts, nSysDim, "quadPts" );
+    AllocateDeviceArray( &_device->quadWeights, static_cast<std::size_t>( _localNSys ), "quadWeights" );
+
+    AllocateDeviceArray( &_device->sol, nCellSys, "sol" );
+    AllocateDeviceArray( &_device->solRK0, nCellSys, "solRK0" );
+    AllocateDeviceArray( &_device->flux, nCellSys, "flux" );
+    AllocateDeviceArray( &_device->scalarFlux, nCells, "scalarFlux" );
+    AllocateDeviceArray( &_device->solDx, nCellSys * static_cast<std::size_t>( _nDim ), "solDx" );
+    AllocateDeviceArray( &_device->limiter, nCellSys, "limiter" );
+
+    _cudaInitialized = true;
+}
+
+void SNSolverHPCCUDA::UploadStaticDataToDevice() {
+    if( !_cudaInitialized || _device == nullptr ) {
+        ErrorMessages::Error( "CUDA backend was not initialized before UploadStaticDataToDevice.", CURRENT_FUNCTION );
+    }
+
+    _cellBoundaryTypesInt.resize( _nCells, static_cast<int>( BOUNDARY_TYPE::INVALID ) );
+#pragma omp parallel for
+    for( unsigned long idxCell = 0; idxCell < _nCells; ++idxCell ) {
+        _cellBoundaryTypesInt[idxCell] = static_cast<int>( _cellBoundaryTypes[idxCell] );
+    }
+
+    _ghostCellValues.assign( static_cast<std::size_t>( _nCells ) * static_cast<std::size_t>( _localNSys ), 0.0 );
+    for( const auto& cellEntry : _ghostCells ) {
+        const unsigned idxCell             = cellEntry.first;
+        const std::vector<double>& values  = cellEntry.second;
+        const unsigned long nGhostOrdinate = std::min( _localNSys, static_cast<unsigned long>( values.size() ) );
+        for( unsigned long idxSys = 0; idxSys < nGhostOrdinate; ++idxSys ) {
+            _ghostCellValues[Idx2D( idxCell, idxSys, _localNSys )] = values[idxSys];
+        }
+    }
+
+    const std::size_t nCells      = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellNbr    = nCells * static_cast<std::size_t>( _nNbr );
+    const std::size_t nCellNbrDim = nCellNbr * static_cast<std::size_t>( _nDim );
+    const std::size_t nCellSys    = nCells * static_cast<std::size_t>( _localNSys );
+    const std::size_t nSysDim     = static_cast<std::size_t>( _localNSys ) * static_cast<std::size_t>( _nDim );
+
+    CheckCuda( cudaMemcpy( _device->areas, _areas.data(), nCells * sizeof( double ), cudaMemcpyHostToDevice ), "copy areas" );
+    CheckCuda( cudaMemcpy( _device->normals, _normals.data(), nCellNbrDim * sizeof( double ), cudaMemcpyHostToDevice ), "copy normals" );
+    CheckCuda( cudaMemcpy( _device->neighbors, _neighbors.data(), nCellNbr * sizeof( unsigned ), cudaMemcpyHostToDevice ), "copy neighbors" );
+    CheckCuda( cudaMemcpy( _device->boundaryTypes, _cellBoundaryTypesInt.data(), nCells * sizeof( int ), cudaMemcpyHostToDevice ),
+               "copy boundary types" );
+    CheckCuda( cudaMemcpy( _device->ghostCellValues, _ghostCellValues.data(), nCellSys * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy ghost values" );
+    CheckCuda(
+        cudaMemcpy( _device->cellMidPoints, _cellMidPoints.data(), nCells * static_cast<std::size_t>( _nDim ) * sizeof( double ), cudaMemcpyHostToDevice ),
+        "copy cell midpoints" );
+    CheckCuda( cudaMemcpy( _device->interfaceMidPoints, _interfaceMidPoints.data(), nCellNbrDim * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy interface midpoints" );
+    CheckCuda( cudaMemcpy( _device->relativeInterfaceMid, _relativeInterfaceMidPt.data(), nCellNbrDim * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy relative interface points" );
+    CheckCuda( cudaMemcpy( _device->relativeCellVertices, _relativeCellVertices.data(), nCellNbrDim * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy relative vertices" );
+    CheckCuda( cudaMemcpy( _device->sigmaS, _sigmaS.data(), nCells * sizeof( double ), cudaMemcpyHostToDevice ), "copy sigmaS" );
+    CheckCuda( cudaMemcpy( _device->sigmaT, _sigmaT.data(), nCells * sizeof( double ), cudaMemcpyHostToDevice ), "copy sigmaT" );
+void SNSolverHPCCUDA::UploadStaticDataToDevice() {
+    if( !_cudaInitialized || _device == nullptr ) {
+        ErrorMessages::Error( "CUDA backend was not initialized before UploadStaticDataToDevice.", CURRENT_FUNCTION );
+    }
+
+    _cellBoundaryTypesInt.resize( _nCells, static_cast<int>( BOUNDARY_TYPE::INVALID ) );
+#pragma omp parallel for
+    for( unsigned long idxCell = 0; idxCell < _nCells; ++idxCell ) {
+        _cellBoundaryTypesInt[idxCell] = static_cast<int>( _cellBoundaryTypes[idxCell] );
+    }
+
+    _ghostCellValues.assign( static_cast<std::size_t>( _nCells ) * static_cast<std::size_t>( _localNSys ), 0.0 );
+    for( const auto& cellEntry : _ghostCells ) {
+        const unsigned idxCell             = cellEntry.first;
+        const std::vector<double>& values  = cellEntry.second;
+        const unsigned long nGhostOrdinate = std::min( _localNSys, static_cast<unsigned long>( values.size() ) );
+        for( unsigned long idxSys = 0; idxSys < nGhostOrdinate; ++idxSys ) {
+            _ghostCellValues[Idx2D( idxCell, idxSys, _localNSys )] = values[idxSys];
+        }
+    }
+
+    const std::size_t nCells      = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellNbr    = nCells * static_cast<std::size_t>( _nNbr );
+    const std::size_t nCellNbrDim = nCellNbr * static_cast<std::size_t>( _nDim );
+    const std::size_t nCellSys    = nCells * static_cast<std::size_t>( _localNSys );
+    const std::size_t nSysDim     = static_cast<std::size_t>( _localNSys ) * static_cast<std::size_t>( _nDim );
+
+    CheckCuda( cudaMemcpy( _device->areas, _areas.data(), nCells * sizeof( double ), cudaMemcpyHostToDevice ), "copy areas" );
+    CheckCuda( cudaMemcpy( _device->normals, _normals.data(), nCellNbrDim * sizeof( double ), cudaMemcpyHostToDevice ), "copy normals" );
+    CheckCuda( cudaMemcpy( _device->neighbors, _neighbors.data(), nCellNbr * sizeof( unsigned ), cudaMemcpyHostToDevice ), "copy neighbors" );
+    CheckCuda( cudaMemcpy( _device->boundaryTypes, _cellBoundaryTypesInt.data(), nCells * sizeof( int ), cudaMemcpyHostToDevice ),
+               "copy boundary types" );
+    CheckCuda( cudaMemcpy( _device->ghostCellValues, _ghostCellValues.data(), nCellSys * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy ghost values" );
+    CheckCuda(
+        cudaMemcpy( _device->cellMidPoints, _cellMidPoints.data(), nCells * static_cast<std::size_t>( _nDim ) * sizeof( double ), cudaMemcpyHostToDevice ),
+        "copy cell midpoints" );
+    CheckCuda( cudaMemcpy( _device->interfaceMidPoints, _interfaceMidPoints.data(), nCellNbrDim * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy interface midpoints" );
+    CheckCuda( cudaMemcpy( _device->relativeInterfaceMid, _relativeInterfaceMidPt.data(), nCellNbrDim * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy relative interface points" );
+    CheckCuda( cudaMemcpy( _device->relativeCellVertices, _relativeCellVertices.data(), nCellNbrDim * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy relative vertices" );
+    CheckCuda( cudaMemcpy( _device->sigmaS, _sigmaS.data(), nCells * sizeof( double ), cudaMemcpyHostToDevice ), "copy sigmaS" );
+    CheckCuda( cudaMemcpy( _device->sigmaT, _sigmaT.data(), nCells * sizeof( double ), cudaMemcpyHostToDevice ), "copy sigmaT" );
+    CheckCuda( cudaMemcpy( _device->source, _source.data(), nCellSys * sizeof( double ), cudaMemcpyHostToDevice ), "copy source" );
+    CheckCuda( cudaMemcpy( _device->quadPts, _quadPts.data(), nSysDim * sizeof( double ), cudaMemcpyHostToDevice ), "copy quad points" );
+    CheckCuda( cudaMemcpy( _device->quadWeights, _quadWeights.data(), static_cast<std::size_t>( _localNSys ) * sizeof( double ), cudaMemcpyHostToDevice ),
+               "copy quad weights" );
+
+    CheckCuda( cudaMemset( _device->flux, 0, nCellSys * sizeof( double ) ), "init flux" );
+    CheckCuda( cudaMemcpy( _device->solDx, _solDx.data(), nCellSys * static_cast<std::size_t>( _nDim ) * sizeof( double ), cudaMemcpyHostToDevice ),
+               "init solDx" );
+    CheckCuda( cudaMemcpy( _device->limiter, _limiter.data(), nCellSys * sizeof( double ), cudaMemcpyHostToDevice ), "init limiter" );
+}
+
+void SNSolverHPCCUDA::UploadStateToDevice() {
+    if( !_cudaInitialized || _device == nullptr ) {
+        ErrorMessages::Error( "CUDA backend was not initialized before UploadStateToDevice.", CURRENT_FUNCTION );
+    }
+    const std::size_t nCells   = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellSys = nCells * static_cast<std::size_t>( _localNSys );
+
+    CheckCuda( cudaMemcpy( _device->sol, _sol.data(), nCellSys * sizeof( double ), cudaMemcpyHostToDevice ), "upload sol" );
+    CheckCuda( cudaMemcpy( _device->scalarFlux, _scalarFlux.data(), nCells * sizeof( double ), cudaMemcpyHostToDevice ), "upload scalar flux" );
+}
+
+void SNSolverHPCCUDA::DownloadStateFromDevice() {
+    if( !_cudaInitialized || _device == nullptr ) {
+        ErrorMessages::Error( "CUDA backend was not initialized before DownloadStateFromDevice.", CURRENT_FUNCTION );
+    }
+    const std::size_t nCells   = static_cast<std::size_t>( _nCells );
+    const std::size_t nCellSys = nCells * static_cast<std::size_t>( _localNSys );
+
+    CheckCuda( cudaMemcpy( _sol.data(), _device->sol, nCellSys * sizeof( double ), cudaMemcpyDeviceToHost ), "download sol" );
+    CheckCuda( cudaMemcpy( _scalarFlux.data(), _device->scalarFlux, nCells * sizeof( double ), cudaMemcpyDeviceToHost ), "download scalar flux" );
+}
+
+void SNSolverHPCCUDA::FreeCUDA() {
+    if( !_cudaInitialized || _device == nullptr ) {
+        return;
+    }
+
+    CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" );
+
+    FreeDeviceArray( _device->areas, "areas" );
+    FreeDeviceArray( _device->normals, "normals" );
+    FreeDeviceArray( _device->neighbors, "neighbors" );
+    FreeDeviceArray( _device->boundaryTypes, "boundaryTypes" );
+    FreeDeviceArray( _device->ghostCellValues, "ghostCellValues" );
+    FreeDeviceArray( _device->cellMidPoints, "cellMidPoints" );
+    FreeDeviceArray( _device->interfaceMidPoints, "interfaceMidPoints" );
+    FreeDeviceArray( _device->relativeInterfaceMid, "relativeInterfaceMid" );
+    FreeDeviceArray( _device->relativeCellVertices, "relativeCellVertices" );
+    FreeDeviceArray( _device->sigmaS, "sigmaS" );
+    FreeDeviceArray( _device->sigmaT, "sigmaT" );
+    FreeDeviceArray( _device->source, "source" );
+    FreeDeviceArray( _device->quadPts, "quadPts" );
+    FreeDeviceArray( _device->quadWeights, "quadWeights" );
+    FreeDeviceArray( _device->sol, "sol" );
+    FreeDeviceArray( _device->solRK0, "solRK0" );
+    FreeDeviceArray( _device->flux, "flux" );
+    FreeDeviceArray( _device->scalarFlux, "scalarFlux" );
+    FreeDeviceArray( _device->solDx, "solDx" );
+    FreeDeviceArray( _device->limiter, "limiter" );
+
+    delete _device;
+    _device          = nullptr;
+    _cudaInitialized = false;
+}
+
+SNSolverHPCCUDA::~SNSolverHPCCUDA() {
+    FreeCUDA();
+    delete _mesh;
+    delete _problem;
+}
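Every kernel launch in the solver below sizes its grid with the usual ceiling division, so one thread handles one work item (a cell, or a cell-ordinate pair). A minimal sketch of the pattern; CUDA_BLOCK_SIZE is a compile-time constant defined elsewhere in this solver, and nItems and SomeKernel are placeholders:

    // One thread per work item; the last, partially filled block covers the remainder.
    const dim3 threads( CUDA_BLOCK_SIZE );
    const dim3 grid( static_cast<unsigned>( ( nItems + CUDA_BLOCK_SIZE - 1 ) / CUDA_BLOCK_SIZE ) );
    SomeKernel<<<grid, threads>>>( /* ... */ );    // each kernel guards: if( idx >= nItems ) return;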
+void SNSolverHPCCUDA::Solve() {
+
+    // --- Preprocessing ---
+    if( _rank == 0 ) {
+        PrepareVolumeOutput();
+        DrawPreSolverOutput();
+    }
+    // On restart, continue simulation time from the loaded iteration index.
+    _curSimTime = static_cast<double>( _idx_start_iter ) * _dT;
+    auto start  = std::chrono::high_resolution_clock::now();    // Start timing
+
+    std::chrono::duration<double> duration;
+    // Loop over energies (pseudo-time of continuous slowing down approach)
+    for( unsigned iter = (unsigned)_idx_start_iter; iter < _nIter; iter++ ) {
+
+        if( iter == _nIter - 1 ) {    // last iteration
+            _dT = _settings->GetTEnd() - iter * _dT;
+        }
+        _scalarFluxPrevIter = _scalarFlux;
+
+        if( _temporalOrder == 2 ) {
+            CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" );
+            CheckCuda( cudaMemcpy( _device->solRK0,
+                                   _device->sol,
+                                   static_cast<std::size_t>( _nCells ) * static_cast<std::size_t>( _localNSys ) * sizeof( double ),
+                                   cudaMemcpyDeviceToDevice ),
+                       "backup solRK0" );
+
+            ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
+            FVMUpdate();
+            ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
+            FVMUpdate();
+
+            const dim3 threads( CUDA_BLOCK_SIZE );
+            const dim3 gridCells( static_cast<unsigned>( ( _nCells + CUDA_BLOCK_SIZE - 1 ) / CUDA_BLOCK_SIZE ) );
+            RK2AverageAndScalarFluxKernel<<<gridCells, threads>>>(
+                _nCells, _localNSys, _device->quadWeights, _device->solRK0, _device->sol, _device->scalarFlux );
+            CheckCuda( cudaGetLastError(), "RK2AverageAndScalarFluxKernel launch" );
+        }
+        else {
+            ( _spatialOrder == 2 ) ? FluxOrder2() : FluxOrder1();
+            FVMUpdate();
+        }
+
+        DownloadStateFromDevice();
+
+        _curSimTime += _dT;
+        IterPostprocessing();
+        // --- Wall time measurement
+        duration  = std::chrono::high_resolution_clock::now() - start;
+        _currTime = std::chrono::duration_cast<std::chrono::duration<double>>( duration ).count();
+
+        // --- Write Output ---
+        if( _rank == 0 ) {
+
+            WriteScalarOutput( iter );
+
+            // --- Print Output ---
+            PrintScreenOutput( iter );
+            PrintHistoryOutput( iter );
+        }
+#ifdef IMPORT_MPI
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+
+        PrintVolumeOutput( iter );
+#ifdef IMPORT_MPI
+        MPI_Barrier( MPI_COMM_WORLD );
+#endif
+    }
+    // --- Postprocessing ---
+    if( _rank == 0 ) {
+        DrawPostSolverOutput();
+    }
+}
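The second-order time integrator above is Heun's method: two explicit Euler stages followed by the average u^{n+1} = (u^n + u~)/2, fused with the scalar-flux quadrature in one pass. RK2AverageAndScalarFluxKernel itself is defined elsewhere in this file; the sketch below is inferred from the call site (one thread per cell, signature taken from the launch) and is an assumption, not the actual implementation:

    // Sketch only: averages the two Euler stages and recomputes the scalar flux.
    __global__ void RK2AverageAndScalarFluxKernel( unsigned long nCells,
                                                   unsigned long nSys,
                                                   const double* quadWeights,
                                                   const double* solRK0,
                                                   double* sol,
                                                   double* scalarFlux ) {
        unsigned long idxCell = blockIdx.x * blockDim.x + threadIdx.x;
        if( idxCell >= nCells ) return;
        double phi = 0.0;
        for( unsigned long q = 0; q < nSys; q++ ) {
            unsigned long i = idxCell * nSys + q;              // Idx2D( idxCell, q, nSys )
            sol[i]          = 0.5 * ( solRK0[i] + sol[i] );    // Heun average of the two stages
            phi += quadWeights[q] * sol[i];                    // quadrature over ordinates
        }
        scalarFlux[idxCell] = phi;
    }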
+void SNSolverHPCCUDA::FluxOrder2() {
+    CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" );
+    const unsigned long nCellSys = _nCells * _localNSys;
+    const dim3 threads( CUDA_BLOCK_SIZE );
+    const dim3 gridCellSys( static_cast<unsigned>( ( nCellSys + CUDA_BLOCK_SIZE - 1 ) / CUDA_BLOCK_SIZE ) );
+    const double eps = 1e-10;
+
+    FluxOrder2SlopeKernel<<<gridCellSys, threads>>>( _nCells,
+                                                     _localNSys,
+                                                     _nNbr,
+                                                     _nDim,
+                                                     _device->areas,
+                                                     _device->normals,
+                                                     _device->neighbors,
+                                                     _device->boundaryTypes,
+                                                     _device->sol,
+                                                     _device->solDx,
+                                                     _device->limiter,
+                                                     static_cast<int>( BOUNDARY_TYPE::NONE ) );
+    CheckCuda( cudaGetLastError(), "FluxOrder2SlopeKernel launch" );
+
+    FluxOrder2LimiterKernel<<<gridCellSys, threads>>>( _nCells,
+                                                       _localNSys,
+                                                       _nNbr,
+                                                       _nDim,
+                                                       _device->areas,
+                                                       _device->neighbors,
+                                                       _device->boundaryTypes,
+                                                       _device->relativeCellVertices,
+                                                       _device->sol,
+                                                       _device->solDx,
+                                                       _device->limiter,
+                                                       static_cast<int>( BOUNDARY_TYPE::NONE ),
+                                                       eps );
+    CheckCuda( cudaGetLastError(), "FluxOrder2LimiterKernel launch" );
+
+    FluxOrder2Kernel<<<gridCellSys, threads>>>( _nCells,
+                                                _localNSys,
+                                                _nNbr,
+                                                _nDim,
+                                                _device->quadPts,
+                                                _device->normals,
+                                                _device->neighbors,
+                                                _device->boundaryTypes,
+                                                _device->ghostCellValues,
+                                                _device->interfaceMidPoints,
+                                                _device->cellMidPoints,
+                                                _device->relativeInterfaceMid,
+                                                _device->sol,
+                                                _device->solDx,
+                                                _device->limiter,
+                                                _device->flux,
+                                                static_cast<int>( BOUNDARY_TYPE::NEUMANN ) );
+    CheckCuda( cudaGetLastError(), "FluxOrder2Kernel launch" );
+}
+
+void SNSolverHPCCUDA::FluxOrder1() {
+    CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" );
+    const unsigned long nCellSys = _nCells * _localNSys;
+    const dim3 threads( CUDA_BLOCK_SIZE );
+    const dim3 gridCellSys( static_cast<unsigned>( ( nCellSys + CUDA_BLOCK_SIZE - 1 ) / CUDA_BLOCK_SIZE ) );
+
+    FluxOrder1Kernel<<<gridCellSys, threads>>>( _nCells,
+                                                _localNSys,
+                                                _nNbr,
+                                                _nDim,
+                                                _device->quadPts,
+                                                _device->normals,
+                                                _device->neighbors,
+                                                _device->boundaryTypes,
+                                                _device->ghostCellValues,
+                                                _device->sol,
+                                                _device->flux,
+                                                static_cast<int>( BOUNDARY_TYPE::NEUMANN ) );
+    CheckCuda( cudaGetLastError(), "FluxOrder1Kernel launch" );
+}
+
+void SNSolverHPCCUDA::FVMUpdate() {
+    CheckCuda( cudaSetDevice( _cudaDeviceId ), "cudaSetDevice" );
+
+    const dim3 threads( CUDA_BLOCK_SIZE );
+    const dim3 gridCells( static_cast<unsigned>( ( _nCells + CUDA_BLOCK_SIZE - 1 ) / CUDA_BLOCK_SIZE ) );
+    const double invTwoPi = 1.0 / static_cast<double>( 2.0L * PI );
+
+    FVMUpdateKernel<<<gridCells, threads>>>( _nCells,
+                                             _localNSys,
+                                             _dT,
+                                             _device->sigmaS,
+                                             _device->sigmaT,
+                                             _device->source,
+                                             _device->areas,
+                                             _device->quadWeights,
+                                             _device->flux,
+                                             _device->sol,
+                                             _device->scalarFlux,
+                                             invTwoPi );
+    CheckCuda( cudaGetLastError(), "FVMUpdateKernel launch" );
+    CheckCuda( cudaMemcpy( _scalarFlux.data(),
+                           _device->scalarFlux,
+                           static_cast<std::size_t>( _nCells ) * sizeof( double ),
+                           cudaMemcpyDeviceToHost ),
+               "download scalar flux in FVMUpdate" );
+
+#ifdef IMPORT_MPI
+    std::vector<double> tempScalarFlux( _scalarFlux );
+    MPI_Barrier( MPI_COMM_WORLD );
+    MPI_Allreduce( tempScalarFlux.data(), _scalarFlux.data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
+    MPI_Barrier( MPI_COMM_WORLD );
+    CheckCuda(
+        cudaMemcpy( _device->scalarFlux, _scalarFlux.data(), static_cast<std::size_t>( _nCells ) * sizeof( double ), cudaMemcpyHostToDevice ),
+        "sync allreduced scalar flux to device" );
+#endif
+}
+
+void SNSolverHPCCUDA::IterPostprocessing() {
+    // ALLREDUCE NEEDED
+    _mass    = 0.0;
+    _rmsFlux = 0.0;
+#pragma omp parallel for reduction( + : _mass, _rmsFlux )
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
+        _mass += _scalarFlux[idx_cell] * _areas[idx_cell];
+        double diff = _scalarFlux[idx_cell] - _scalarFluxPrevIter[idx_cell];
+        _rmsFlux += diff * diff;
+    }
+
+    _curAbsorptionLattice            = 0.0;
+    _curScalarOutflow                = 0.0;
+    _curScalarOutflowPeri1           = 0.0;
+    _curScalarOutflowPeri2           = 0.0;
+    _curAbsorptionHohlraumCenter     = 0.0;    // Green and blue areas of symmetric hohlraum
+    _curAbsorptionHohlraumVertical   = 0.0;    // Red areas of symmetric hohlraum
+    _curAbsorptionHohlraumHorizontal = 0.0;    // Black areas of symmetric hohlraum
+    _varAbsorptionHohlraumGreen      = 0.0;
+    double a_g                       = 0.0;
+
+#pragma omp parallel for reduction( + : _curAbsorptionLattice,                                        \
+                                        _curScalarOutflow,                                            \
+                                        _curScalarOutflowPeri1,                                       \
+                                        _curScalarOutflowPeri2,                                       \
+                                        _curAbsorptionHohlraumCenter,                                 \
+                                        _curAbsorptionHohlraumVertical,                               \
+                                        _curAbsorptionHohlraumHorizontal,                             \
+                                        a_g ) reduction( max : _curMaxOrdinateOutflow, _curMaxAbsorptionLattice )
+    for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) {
+
+        if( _settings->GetProblemName() == PROBLEM_Lattice || _settings->GetProblemName() == PROBLEM_HalfLattice ) {
+            if( IsAbsorptionLattice( _cellMidPoints[Idx2D( idx_cell, 0, _nDim )], _cellMidPoints[Idx2D( idx_cell, 1, _nDim )] ) ) {
+                double sigmaAPsi = _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell];
+                _curAbsorptionLattice += sigmaAPsi;
+                _curMaxAbsorptionLattice = ( _curMaxAbsorptionLattice < sigmaAPsi ) ?
sigmaAPsi : _curMaxAbsorptionLattice; + } + } + + if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) { //} || _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) { + + double x = _cellMidPoints[Idx2D( idx_cell, 0, _nDim )]; + double y = _cellMidPoints[Idx2D( idx_cell, 1, _nDim )]; + _curAbsorptionLattice += _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell]; + if( x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() && + y > -0.4 + _settings->GetPosYCenterGreenHohlraum() && y < 0.4 + _settings->GetPosYCenterGreenHohlraum() ) { + _curAbsorptionHohlraumCenter += _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell]; + } + if( ( x < _settings->GetPosRedLeftBorderHohlraum() && y > _settings->GetPosRedLeftBottomHohlraum() && + y < _settings->GetPosRedLeftTopHohlraum() ) || + ( x > _settings->GetPosRedRightBorderHohlraum() && y > _settings->GetPosRedLeftBottomHohlraum() && + y < _settings->GetPosRedRightTopHohlraum() ) ) { + _curAbsorptionHohlraumVertical += _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell]; + } + if( y > 0.6 || y < -0.6 ) { + _curAbsorptionHohlraumHorizontal += _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _areas[idx_cell]; + } + + // Variation in absorption of center (part 1) + // green area 1 (lower boundary) + bool green1 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() && + y > -0.4 + _settings->GetPosYCenterGreenHohlraum() && y < -0.35 + _settings->GetPosYCenterGreenHohlraum(); + // green area 2 (upper boundary) + bool green2 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() && + y > 0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.4 + _settings->GetPosYCenterGreenHohlraum(); + // green area 3 (left boundary) + bool green3 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < -0.15 + _settings->GetPosXCenterGreenHohlraum() && + y > -0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.35 + _settings->GetPosYCenterGreenHohlraum(); + // green area 4 (right boundary) + bool green4 = x > 0.15 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() && + y > -0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.35 + _settings->GetPosYCenterGreenHohlraum(); + if( green1 || green2 || green3 || green4 ) { + a_g += ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) * _scalarFlux[idx_cell] * _areas[idx_cell] / + ( 44 * 0.05 * 0.05 ); // divisor is area of the green + } + } + + if( _settings->GetProblemName() == PROBLEM_Lattice ) { + // Outflow out of inner and middle perimeter + if( _isPerimeterLatticeCell1[idx_cell] ) { // inner + for( unsigned long idx_nbr_helper = 0; idx_nbr_helper < _cellsLatticePerimeter1[idx_cell].size(); ++idx_nbr_helper ) { +#pragma omp simd reduction( + : _curScalarOutflowPeri1 ) + for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) { + double localInner = _quadPts[Idx2D( idx_sys, 0, _nDim )] * + _normals[Idx3D( idx_cell, _cellsLatticePerimeter1[idx_cell][idx_nbr_helper], 0, _nNbr, _nDim )] + + _quadPts[Idx2D( idx_sys, 1, _nDim )] * + _normals[Idx3D( idx_cell, _cellsLatticePerimeter1[idx_cell][idx_nbr_helper], 1, _nNbr, _nDim )]; + // Find outward facing transport directions + + if( localInner > 0.0 ) { + _curScalarOutflowPeri1 += + localInner * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * 
_quadWeights[idx_sys]; // Integrate flux + } + } + } + } + if( _isPerimeterLatticeCell2[idx_cell] ) { // middle + for( unsigned long idx_nbr_helper = 0; idx_nbr_helper < _cellsLatticePerimeter2[idx_cell].size(); ++idx_nbr_helper ) { +#pragma omp simd reduction( + : _curScalarOutflowPeri2 ) + for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) { + double localInner = _quadPts[Idx2D( idx_sys, 0, _nDim )] * + _normals[Idx3D( idx_cell, _cellsLatticePerimeter2[idx_cell][idx_nbr_helper], 0, _nNbr, _nDim )] + + _quadPts[Idx2D( idx_sys, 1, _nDim )] * + _normals[Idx3D( idx_cell, _cellsLatticePerimeter2[idx_cell][idx_nbr_helper], 1, _nNbr, _nDim )]; + // Find outward facing transport directions + + if( localInner > 0.0 ) { + _curScalarOutflowPeri2 += + localInner * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys]; // Integrate flux + } + } + } + } + } + // Outflow out of domain + if( _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::NEUMANN ) { + // Iterate over face cell faces + double currOrdinatewiseOutflow = 0.0; + + for( unsigned long idx_nbr = 0; idx_nbr < _nNbr; ++idx_nbr ) { + // Find face that points outward + if( _neighbors[Idx2D( idx_cell, idx_nbr, _nNbr )] == _nCells ) { +#pragma omp simd reduction( + : _curScalarOutflow ) reduction( max : _curMaxOrdinateOutflow ) + for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) { + + double localInner = _quadPts[Idx2D( idx_sys, 0, _nDim )] * _normals[Idx3D( idx_cell, idx_nbr, 0, _nNbr, _nDim )] + + _quadPts[Idx2D( idx_sys, 1, _nDim )] * _normals[Idx3D( idx_cell, idx_nbr, 1, _nNbr, _nDim )]; + // Find outward facing transport directions + + if( localInner > 0.0 ) { + _curScalarOutflow += + localInner * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys]; // Integrate flux + + currOrdinatewiseOutflow = + _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * localInner / + sqrt( ( + _normals[Idx3D( idx_cell, idx_nbr, 0, _nNbr, _nDim )] * _normals[Idx3D( idx_cell, idx_nbr, 0, _nNbr, _nDim )] + + _normals[Idx3D( idx_cell, idx_nbr, 1, _nNbr, _nDim )] * _normals[Idx3D( idx_cell, idx_nbr, 1, _nNbr, _nDim )] ) ); + + _curMaxOrdinateOutflow = + ( currOrdinatewiseOutflow > _curMaxOrdinateOutflow ) ? 
currOrdinatewiseOutflow : _curMaxOrdinateOutflow; + } + } + } + } + } + } +// MPI Allreduce +#ifdef IMPORT_MPI + double tmp_curScalarOutflow = 0.0; + double tmp_curScalarOutflowPeri1 = 0.0; + double tmp_curScalarOutflowPeri2 = 0.0; + double tmp_curMaxOrdinateOutflow = 0.0; + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Allreduce( &_curScalarOutflow, &tmp_curScalarOutflow, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + _curScalarOutflow = tmp_curScalarOutflow; + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Allreduce( &_curScalarOutflowPeri1, &tmp_curScalarOutflowPeri1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + _curScalarOutflowPeri1 = tmp_curScalarOutflowPeri1; + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Allreduce( &_curScalarOutflowPeri2, &tmp_curScalarOutflowPeri2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + _curScalarOutflowPeri2 = tmp_curScalarOutflowPeri2; + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Allreduce( &_curMaxOrdinateOutflow, &tmp_curMaxOrdinateOutflow, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD ); + _curMaxOrdinateOutflow = tmp_curMaxOrdinateOutflow; + MPI_Barrier( MPI_COMM_WORLD ); +#endif + // Variation absorption (part II) + if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) { + unsigned n_probes = 4; + std::vector temp_probingMoments( 3 * n_probes ); // for MPI allreduce + +#pragma omp parallel for reduction( + : _varAbsorptionHohlraumGreen ) + for( unsigned long idx_cell = 0; idx_cell < _nCells; ++idx_cell ) { + double x = _cellMidPoints[Idx2D( idx_cell, 0, _nDim )]; + double y = _cellMidPoints[Idx2D( idx_cell, 1, _nDim )]; + + // green area 1 (lower boundary) + bool green1 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() && + y > -0.4 + _settings->GetPosYCenterGreenHohlraum() && y < -0.35 + _settings->GetPosYCenterGreenHohlraum(); + // green area 2 (upper boundary) + bool green2 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() && + y > 0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.4 + _settings->GetPosYCenterGreenHohlraum(); + // green area 3 (left boundary) + bool green3 = x > -0.2 + _settings->GetPosXCenterGreenHohlraum() && x < -0.15 + _settings->GetPosXCenterGreenHohlraum() && + y > -0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.35 + _settings->GetPosYCenterGreenHohlraum(); + // green area 4 (right boundary) + bool green4 = x > 0.15 + _settings->GetPosXCenterGreenHohlraum() && x < 0.2 + _settings->GetPosXCenterGreenHohlraum() && + y > -0.35 + _settings->GetPosYCenterGreenHohlraum() && y < 0.35 + _settings->GetPosYCenterGreenHohlraum(); + if( green1 || green2 || green3 || green4 ) { + _varAbsorptionHohlraumGreen += ( a_g - _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) ) * + ( a_g - _scalarFlux[idx_cell] * ( _sigmaT[idx_cell] - _sigmaS[idx_cell] ) ) * _areas[idx_cell]; + } + } + // Probes value moments + // #pragma omp parallel for + for( unsigned long idx_probe = 0; idx_probe < n_probes; idx_probe++ ) { // Loop over probing cells + temp_probingMoments[Idx2D( idx_probe, 0, 3 )] = 0.0; + temp_probingMoments[Idx2D( idx_probe, 1, 3 )] = 0.0; + temp_probingMoments[Idx2D( idx_probe, 2, 3 )] = 0.0; + // for( unsigned long idx_ball = 0; idx_ball < _probingCellsHohlraum[idx_probe].size(); idx_ball++ ) { + // std::cout << idx_ball << _areas[_probingCellsHohlraum[idx_probe][idx_ball]] / ( 0.01 * 0.01 * M_PI ) << std::endl; + //} + for( unsigned long idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) { + for( unsigned long idx_ball = 0; idx_ball < 
_probingCellsHohlraum[idx_probe].size(); idx_ball++ ) { + temp_probingMoments[Idx2D( idx_probe, 0, 3 )] += _sol[Idx2D( _probingCellsHohlraum[idx_probe][idx_ball], idx_sys, _localNSys )] * + _quadWeights[idx_sys] * _areas[_probingCellsHohlraum[idx_probe][idx_ball]] / + ( 0.01 * 0.01 * M_PI ); + temp_probingMoments[Idx2D( idx_probe, 1, 3 )] += + _quadPts[Idx2D( idx_sys, 0, _nDim )] * _sol[Idx2D( _probingCellsHohlraum[idx_probe][idx_ball], idx_sys, _localNSys )] * + _quadWeights[idx_sys] * _areas[_probingCellsHohlraum[idx_probe][idx_ball]] / ( 0.01 * 0.01 * M_PI ); + temp_probingMoments[Idx2D( idx_probe, 2, 3 )] += + _quadPts[Idx2D( idx_sys, 1, _nDim )] * _sol[Idx2D( _probingCellsHohlraum[idx_probe][idx_ball], idx_sys, _localNSys )] * + _quadWeights[idx_sys] * _areas[_probingCellsHohlraum[idx_probe][idx_ball]] / ( 0.01 * 0.01 * M_PI ); + } + } + } + +#ifdef IMPORT_MPI + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Allreduce( temp_probingMoments.data(), _probingMoments.data(), 3 * n_probes, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + MPI_Barrier( MPI_COMM_WORLD ); +#endif +#ifndef IMPORT_MPI + for( unsigned long idx_probe = 0; idx_probe < n_probes; idx_probe++ ) { // Loop over probing cells + _probingMoments[Idx2D( idx_probe, 0, 3 )] = temp_probingMoments[Idx2D( idx_probe, 0, 3 )]; + _probingMoments[Idx2D( idx_probe, 1, 3 )] = temp_probingMoments[Idx2D( idx_probe, 1, 3 )]; + _probingMoments[Idx2D( idx_probe, 2, 3 )] = temp_probingMoments[Idx2D( idx_probe, 2, 3 )]; + } +#endif + } + // probe values green + if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) { + ComputeQOIsGreenProbingLine(); + } + // Update time integral values on rank 0 + if( _rank == 0 ) { + _totalScalarOutflow += _curScalarOutflow * _dT; + _totalScalarOutflowPeri1 += _curScalarOutflowPeri1 * _dT; + _totalScalarOutflowPeri2 += _curScalarOutflowPeri2 * _dT; + _totalAbsorptionLattice += _curAbsorptionLattice * _dT; + + _totalAbsorptionHohlraumCenter += _curAbsorptionHohlraumCenter * _dT; + _totalAbsorptionHohlraumVertical += _curAbsorptionHohlraumVertical * _dT; + _totalAbsorptionHohlraumHorizontal += _curAbsorptionHohlraumHorizontal * _dT; + + _rmsFlux = sqrt( _rmsFlux ); + } +} + +bool SNSolverHPCCUDA::IsAbsorptionLattice( double x, double y ) const { + // Check whether pos is inside absorbing squares + double xy_corrector = -3.5; + std::vector lbounds{ 1 + xy_corrector, 2 + xy_corrector, 3 + xy_corrector, 4 + xy_corrector, 5 + xy_corrector }; + std::vector ubounds{ 2 + xy_corrector, 3 + xy_corrector, 4 + xy_corrector, 5 + xy_corrector, 6 + xy_corrector }; + for( unsigned k = 0; k < lbounds.size(); ++k ) { + for( unsigned l = 0; l < lbounds.size(); ++l ) { + if( ( l + k ) % 2 == 1 || ( k == 2 && l == 2 ) || ( k == 2 && l == 4 ) ) continue; + if( x >= lbounds[k] && x <= ubounds[k] && y >= lbounds[l] && y <= ubounds[l] ) { + return true; + } + } + } + return false; +} + +// --- Helper --- +double SNSolverHPCCUDA::ComputeTimeStep( double cfl ) const { + // for pseudo 1D, set timestep to dx + double dx, dy; + switch( _settings->GetProblemName() ) { + case PROBLEM_Checkerboard1D: + dx = 7.0 / (double)_nCells; + dy = 0.3; + return cfl * ( dx * dy ) / ( dx + dy ); + break; + case PROBLEM_Linesource1D: // Fallthrough + case PROBLEM_Meltingcube1D: // Fallthrough + case PROBLEM_Aircavity1D: + dx = 3.0 / (double)_nCells; + dy = 0.3; + return cfl * ( dx * dy ) / ( dx + dy ); + break; + default: break; // 2d as normal + } + // 2D case + double charSize = __DBL_MAX__; // minimum char size of all mesh cells in the mesh + +#pragma omp 
parallel for reduction( min : charSize ) + for( unsigned long j = 0; j < _nCells; j++ ) { + double currCharSize = sqrt( _areas[j] ); + if( currCharSize < charSize ) { + charSize = currCharSize; + } + } + if( _rank == 0 ) { + auto log = spdlog::get( "event" ); + std::string line = "| Smallest characteristic length of a grid cell in this mesh: " + std::to_string( charSize ); + log->info( line ); + line = "| Corresponding maximal time-step: " + std::to_string( cfl * charSize ); + log->info( line ); + } + return cfl * charSize; +} + +// --- IO ---- +void SNSolverHPCCUDA::PrepareScreenOutput() { + unsigned nFields = (unsigned)_settings->GetNScreenOutput(); + + _screenOutputFieldNames.resize( nFields ); + _screenOutputFields.resize( nFields ); + + // Prepare all output Fields ==> Specified in option SCREEN_OUTPUT + for( unsigned idx_field = 0; idx_field < nFields; idx_field++ ) { + // Prepare all Output Fields per group + + // Different procedure, depending on the Group... + switch( _settings->GetScreenOutput()[idx_field] ) { + case MASS: _screenOutputFieldNames[idx_field] = "Mass"; break; + case ITER: _screenOutputFieldNames[idx_field] = "Iter"; break; + case SIM_TIME: _screenOutputFieldNames[idx_field] = "Sim time"; break; + case WALL_TIME: _screenOutputFieldNames[idx_field] = "Wall time [s]"; break; + case RMS_FLUX: _screenOutputFieldNames[idx_field] = "RMS flux"; break; + case VTK_OUTPUT: _screenOutputFieldNames[idx_field] = "VTK out"; break; + case CSV_OUTPUT: _screenOutputFieldNames[idx_field] = "CSV out"; break; + case CUR_OUTFLOW: _screenOutputFieldNames[idx_field] = "Cur. outflow"; break; + case TOTAL_OUTFLOW: _screenOutputFieldNames[idx_field] = "Tot. outflow"; break; + case CUR_OUTFLOW_P1: _screenOutputFieldNames[idx_field] = "Cur. outflow P1"; break; + case TOTAL_OUTFLOW_P1: _screenOutputFieldNames[idx_field] = "Tot. outflow P1"; break; + case CUR_OUTFLOW_P2: _screenOutputFieldNames[idx_field] = "Cur. outflow P2"; break; + case TOTAL_OUTFLOW_P2: _screenOutputFieldNames[idx_field] = "Tot. outflow P2"; break; + case MAX_OUTFLOW: _screenOutputFieldNames[idx_field] = "Max outflow"; break; + case CUR_PARTICLE_ABSORPTION: _screenOutputFieldNames[idx_field] = "Cur. absorption"; break; + case TOTAL_PARTICLE_ABSORPTION: _screenOutputFieldNames[idx_field] = "Tot. absorption"; break; + case MAX_PARTICLE_ABSORPTION: _screenOutputFieldNames[idx_field] = "Max absorption"; break; + case TOTAL_PARTICLE_ABSORPTION_CENTER: _screenOutputFieldNames[idx_field] = "Tot. abs. center"; break; + case TOTAL_PARTICLE_ABSORPTION_VERTICAL: _screenOutputFieldNames[idx_field] = "Tot. abs. vertical wall"; break; + case TOTAL_PARTICLE_ABSORPTION_HORIZONTAL: _screenOutputFieldNames[idx_field] = "Tot. abs. horizontal wall"; break; + case PROBE_MOMENT_TIME_TRACE: + _screenOutputFieldNames[idx_field] = "Probe 1 u_0"; + idx_field++; + _screenOutputFieldNames[idx_field] = "Probe 2 u_0"; + if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) { + idx_field++; + _screenOutputFieldNames[idx_field] = "Probe 3 u_0"; + idx_field++; + _screenOutputFieldNames[idx_field] = "Probe 4 u_0"; + } + break; + case VAR_ABSORPTION_GREEN: _screenOutputFieldNames[idx_field] = "Var. 
absorption green"; break; + + default: ErrorMessages::Error( "Screen output field not defined!", CURRENT_FUNCTION ); break; + } + } +} + +void SNSolverHPCCUDA::WriteScalarOutput( unsigned idx_iter ) { + unsigned n_probes = 4; + + unsigned nFields = (unsigned)_settings->GetNScreenOutput(); + const VectorVector probingMoments = _problem->GetCurrentProbeMoment(); + // -- Screen Output + for( unsigned idx_field = 0; idx_field < nFields; idx_field++ ) { + // Prepare all Output Fields per group + // Different procedure, depending on the Group... + switch( _settings->GetScreenOutput()[idx_field] ) { + case MASS: _screenOutputFields[idx_field] = _mass; break; + case ITER: _screenOutputFields[idx_field] = idx_iter; break; + case SIM_TIME: _screenOutputFields[idx_field] = _curSimTime; break; + case WALL_TIME: _screenOutputFields[idx_field] = _currTime; break; + case RMS_FLUX: _screenOutputFields[idx_field] = _rmsFlux; break; + case VTK_OUTPUT: + _screenOutputFields[idx_field] = 0; + if( ( _settings->GetVolumeOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetVolumeOutputFrequency() == 0 ) || + ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) { + _screenOutputFields[idx_field] = 1; + } + break; + case CSV_OUTPUT: + _screenOutputFields[idx_field] = 0; + if( ( _settings->GetHistoryOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetHistoryOutputFrequency() == 0 ) || + ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) { + _screenOutputFields[idx_field] = 1; + } + break; + case CUR_OUTFLOW: _screenOutputFields[idx_field] = _curScalarOutflow; break; + case TOTAL_OUTFLOW: _screenOutputFields[idx_field] = _totalScalarOutflow; break; + case CUR_OUTFLOW_P1: _screenOutputFields[idx_field] = _curScalarOutflowPeri1; break; + case TOTAL_OUTFLOW_P1: _screenOutputFields[idx_field] = _totalScalarOutflowPeri1; break; + case CUR_OUTFLOW_P2: _screenOutputFields[idx_field] = _curScalarOutflowPeri2; break; + case TOTAL_OUTFLOW_P2: _screenOutputFields[idx_field] = _totalScalarOutflowPeri2; break; + case MAX_OUTFLOW: _screenOutputFields[idx_field] = _curMaxOrdinateOutflow; break; + case CUR_PARTICLE_ABSORPTION: _screenOutputFields[idx_field] = _curAbsorptionLattice; break; + case TOTAL_PARTICLE_ABSORPTION: _screenOutputFields[idx_field] = _totalAbsorptionLattice; break; + case MAX_PARTICLE_ABSORPTION: _screenOutputFields[idx_field] = _curMaxAbsorptionLattice; break; + case TOTAL_PARTICLE_ABSORPTION_CENTER: _screenOutputFields[idx_field] = _totalAbsorptionHohlraumCenter; break; + case TOTAL_PARTICLE_ABSORPTION_VERTICAL: _screenOutputFields[idx_field] = _totalAbsorptionHohlraumVertical; break; + case TOTAL_PARTICLE_ABSORPTION_HORIZONTAL: _screenOutputFields[idx_field] = _totalAbsorptionHohlraumHorizontal; break; + case PROBE_MOMENT_TIME_TRACE: + if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) n_probes = 4; + // if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) n_probes = 2; + for( unsigned i = 0; i < n_probes; i++ ) { + _screenOutputFields[idx_field] = _probingMoments[Idx2D( i, 0, 3 )]; + idx_field++; + } + idx_field--; + break; + case VAR_ABSORPTION_GREEN: _screenOutputFields[idx_field] = _varAbsorptionHohlraumGreen; break; + default: ErrorMessages::Error( "Screen output group not defined!", CURRENT_FUNCTION ); break; + } + } + + // --- History output --- + nFields = (unsigned)_settings->GetNHistoryOutput(); + + std::vector screenOutputFields = _settings->GetScreenOutput(); + for( unsigned idx_field = 0; idx_field < nFields; idx_field++ ) { + + // Prepare 
all Output Fields per group + // Different procedure, depending on the Group... + switch( _settings->GetHistoryOutput()[idx_field] ) { + case MASS: _historyOutputFields[idx_field] = _mass; break; + case ITER: _historyOutputFields[idx_field] = idx_iter; break; + case SIM_TIME: _historyOutputFields[idx_field] = _curSimTime; break; + case WALL_TIME: _historyOutputFields[idx_field] = _currTime; break; + case RMS_FLUX: _historyOutputFields[idx_field] = _rmsFlux; break; + case VTK_OUTPUT: + _historyOutputFields[idx_field] = 0; + if( ( _settings->GetVolumeOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetVolumeOutputFrequency() == 0 ) || + ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) { + _historyOutputFields[idx_field] = 1; + } + break; + + case CSV_OUTPUT: + _historyOutputFields[idx_field] = 0; + if( ( _settings->GetHistoryOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetHistoryOutputFrequency() == 0 ) || + ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) { + _historyOutputFields[idx_field] = 1; + } + break; + case CUR_OUTFLOW: _historyOutputFields[idx_field] = _curScalarOutflow; break; + case TOTAL_OUTFLOW: _historyOutputFields[idx_field] = _totalScalarOutflow; break; + case CUR_OUTFLOW_P1: _historyOutputFields[idx_field] = _curScalarOutflowPeri1; break; + case TOTAL_OUTFLOW_P1: _historyOutputFields[idx_field] = _totalScalarOutflowPeri1; break; + case CUR_OUTFLOW_P2: _historyOutputFields[idx_field] = _curScalarOutflowPeri2; break; + case TOTAL_OUTFLOW_P2: _historyOutputFields[idx_field] = _totalScalarOutflowPeri2; break; + case MAX_OUTFLOW: _historyOutputFields[idx_field] = _curMaxOrdinateOutflow; break; + case CUR_PARTICLE_ABSORPTION: _historyOutputFields[idx_field] = _curAbsorptionLattice; break; + case TOTAL_PARTICLE_ABSORPTION: _historyOutputFields[idx_field] = _totalAbsorptionLattice; break; + case MAX_PARTICLE_ABSORPTION: _historyOutputFields[idx_field] = _curMaxAbsorptionLattice; break; + case TOTAL_PARTICLE_ABSORPTION_CENTER: _historyOutputFields[idx_field] = _totalAbsorptionHohlraumCenter; break; + case TOTAL_PARTICLE_ABSORPTION_VERTICAL: _historyOutputFields[idx_field] = _totalAbsorptionHohlraumVertical; break; + case TOTAL_PARTICLE_ABSORPTION_HORIZONTAL: _historyOutputFields[idx_field] = _totalAbsorptionHohlraumHorizontal; break; + case PROBE_MOMENT_TIME_TRACE: + if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) n_probes = 4; + // if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) n_probes = 2; + for( unsigned i = 0; i < n_probes; i++ ) { + for( unsigned j = 0; j < 3; j++ ) { + _historyOutputFields[idx_field] = _probingMoments[Idx2D( i, j, 3 )]; + idx_field++; + } + } + idx_field--; + break; + case VAR_ABSORPTION_GREEN: _historyOutputFields[idx_field] = _varAbsorptionHohlraumGreen; break; + case ABSORPTION_GREEN_LINE: + for( unsigned i = 0; i < _settings->GetNumProbingCellsLineHohlraum(); i++ ) { + _historyOutputFields[idx_field] = _absorptionValsLineSegment[i]; + idx_field++; + } + idx_field--; + break; + case ABSORPTION_GREEN_BLOCK: + for( unsigned i = 0; i < 44; i++ ) { + _historyOutputFields[idx_field] = _absorptionValsBlocksGreen[i]; + // std::cout << _absorptionValsBlocksGreen[i] << "/" << _historyOutputFields[idx_field] << std::endl; + idx_field++; + } + idx_field--; + break; + default: ErrorMessages::Error( "History output group not defined!", CURRENT_FUNCTION ); break; + } + } +} + +void SNSolverHPCCUDA::PrintScreenOutput( unsigned idx_iter ) { + auto log = spdlog::get( "event" ); + + unsigned strLen 
= 15; // max width of one column + char paddingChar = ' '; + + // assemble the line to print + std::string lineToPrint = "| "; + std::string tmp; + for( unsigned idx_field = 0; idx_field < _settings->GetNScreenOutput(); idx_field++ ) { + tmp = std::to_string( _screenOutputFields[idx_field] ); + + // Format outputs correctly + std::vector integerFields = { ITER }; + std::vector scientificFields = { RMS_FLUX, + MASS, + WALL_TIME, + CUR_OUTFLOW, + TOTAL_OUTFLOW, + CUR_OUTFLOW_P1, + TOTAL_OUTFLOW_P1, + CUR_OUTFLOW_P2, + TOTAL_OUTFLOW_P2, + MAX_OUTFLOW, + CUR_PARTICLE_ABSORPTION, + TOTAL_PARTICLE_ABSORPTION, + MAX_PARTICLE_ABSORPTION, + TOTAL_PARTICLE_ABSORPTION_CENTER, + TOTAL_PARTICLE_ABSORPTION_VERTICAL, + TOTAL_PARTICLE_ABSORPTION_HORIZONTAL, + PROBE_MOMENT_TIME_TRACE, + VAR_ABSORPTION_GREEN, + ABSORPTION_GREEN_BLOCK, + ABSORPTION_GREEN_LINE }; + std::vector booleanFields = { VTK_OUTPUT, CSV_OUTPUT }; + + if( !( integerFields.end() == std::find( integerFields.begin(), integerFields.end(), _settings->GetScreenOutput()[idx_field] ) ) ) { + tmp = std::to_string( (int)_screenOutputFields[idx_field] ); + } + else if( !( booleanFields.end() == std::find( booleanFields.begin(), booleanFields.end(), _settings->GetScreenOutput()[idx_field] ) ) ) { + tmp = "no"; + if( (bool)_screenOutputFields[idx_field] ) tmp = "yes"; + } + else if( !( scientificFields.end() == + std::find( scientificFields.begin(), scientificFields.end(), _settings->GetScreenOutput()[idx_field] ) ) ) { + + std::stringstream ss; + ss << TextProcessingToolbox::DoubleToScientificNotation( _screenOutputFields[idx_field] ); + tmp = ss.str(); + tmp.erase( std::remove( tmp.begin(), tmp.end(), '+' ), tmp.end() ); // removing the '+' sign + } + + if( strLen > tmp.size() ) // Padding + tmp.insert( 0, strLen - tmp.size(), paddingChar ); + else if( strLen < tmp.size() ) // Cutting + tmp.resize( strLen ); + + lineToPrint += tmp + " |"; + } + if( _settings->GetScreenOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetScreenOutputFrequency() == 0 ) { + log->info( lineToPrint ); + } + else if( idx_iter == _nIter - 1 ) { // Always print last iteration + log->info( lineToPrint ); + } +} + +void SNSolverHPCCUDA::PrepareHistoryOutput() { + unsigned n_probes = 4; + + unsigned nFields = (unsigned)_settings->GetNHistoryOutput(); + + _historyOutputFieldNames.resize( nFields ); + _historyOutputFields.resize( nFields ); + + // Prepare all output Fields ==> Specified in option SCREEN_OUTPUT + for( unsigned idx_field = 0; idx_field < nFields; idx_field++ ) { + // Prepare all Output Fields per group + + // Different procedure, depending on the Group... 
+ switch( _settings->GetHistoryOutput()[idx_field] ) { + case MASS: _historyOutputFieldNames[idx_field] = "Mass"; break; + case ITER: _historyOutputFieldNames[idx_field] = "Iter"; break; + case SIM_TIME: _historyOutputFieldNames[idx_field] = "Sim_time"; break; + case WALL_TIME: _historyOutputFieldNames[idx_field] = "Wall_time_[s]"; break; + case RMS_FLUX: _historyOutputFieldNames[idx_field] = "RMS_flux"; break; + case VTK_OUTPUT: _historyOutputFieldNames[idx_field] = "VTK_out"; break; + case CSV_OUTPUT: _historyOutputFieldNames[idx_field] = "CSV_out"; break; + case CUR_OUTFLOW: _historyOutputFieldNames[idx_field] = "Cur_outflow"; break; + case TOTAL_OUTFLOW: _historyOutputFieldNames[idx_field] = "Total_outflow"; break; + case CUR_OUTFLOW_P1: _historyOutputFieldNames[idx_field] = "Cur_outflow_P1"; break; + case TOTAL_OUTFLOW_P1: _historyOutputFieldNames[idx_field] = "Total_outflow_P1"; break; + case CUR_OUTFLOW_P2: _historyOutputFieldNames[idx_field] = "Cur_outflow_P2"; break; + case TOTAL_OUTFLOW_P2: _historyOutputFieldNames[idx_field] = "Total_outflow_P2"; break; + case MAX_OUTFLOW: _historyOutputFieldNames[idx_field] = "Max_outflow"; break; + case CUR_PARTICLE_ABSORPTION: _historyOutputFieldNames[idx_field] = "Cur_absorption"; break; + case TOTAL_PARTICLE_ABSORPTION: _historyOutputFieldNames[idx_field] = "Total_absorption"; break; + case MAX_PARTICLE_ABSORPTION: _historyOutputFieldNames[idx_field] = "Max_absorption"; break; + case TOTAL_PARTICLE_ABSORPTION_CENTER: _historyOutputFieldNames[idx_field] = "Cumulated_absorption_center"; break; + case TOTAL_PARTICLE_ABSORPTION_VERTICAL: _historyOutputFieldNames[idx_field] = "Cumulated_absorption_vertical_wall"; break; + case TOTAL_PARTICLE_ABSORPTION_HORIZONTAL: _historyOutputFieldNames[idx_field] = "Cumulated_absorption_horizontal_wall"; break; + case PROBE_MOMENT_TIME_TRACE: + if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) n_probes = 4; + // if( _settings->GetProblemName() == PROBLEM_QuarterHohlraum ) n_probes = 2; + for( unsigned i = 0; i < n_probes; i++ ) { + for( unsigned j = 0; j < 3; j++ ) { + _historyOutputFieldNames[idx_field] = "Probe " + std::to_string( i ) + " u_" + std::to_string( j ); + idx_field++; + } + } + idx_field--; + break; + case VAR_ABSORPTION_GREEN: _historyOutputFieldNames[idx_field] = "Var. 
absorption green"; break; + case ABSORPTION_GREEN_BLOCK: + for( unsigned i = 0; i < 44; i++ ) { + _historyOutputFieldNames[idx_field] = "Probe Green Block " + std::to_string( i ); + idx_field++; + } + idx_field--; + break; + + case ABSORPTION_GREEN_LINE: + for( unsigned i = 0; i < _settings->GetNumProbingCellsLineHohlraum(); i++ ) { + _historyOutputFieldNames[idx_field] = "Probe Green Line " + std::to_string( i ); + idx_field++; + } + idx_field--; + break; + default: ErrorMessages::Error( "History output field not defined!", CURRENT_FUNCTION ); break; + } + } +} + +void SNSolverHPCCUDA::PrintHistoryOutput( unsigned idx_iter ) { + + auto log = spdlog::get( "tabular" ); + + // assemble the line to print + std::string lineToPrint = ""; + std::string tmp; + for( int idx_field = 0; idx_field < _settings->GetNHistoryOutput() - 1; idx_field++ ) { + if( idx_field == 0 ) { + tmp = std::to_string( _historyOutputFields[idx_field] ); // Iteration count + } + else { + tmp = TextProcessingToolbox::DoubleToScientificNotation( _historyOutputFields[idx_field] ); + } + lineToPrint += tmp + ","; + } + tmp = TextProcessingToolbox::DoubleToScientificNotation( _historyOutputFields[_settings->GetNHistoryOutput() - 1] ); + lineToPrint += tmp; // Last element without comma + // std::cout << lineToPrint << std::endl; + if( _settings->GetHistoryOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetHistoryOutputFrequency() == 0 ) { + log->info( lineToPrint ); + } + else if( idx_iter == _nIter - 1 ) { // Always print last iteration + log->info( lineToPrint ); + } +} + +void SNSolverHPCCUDA::DrawPreSolverOutput() { + + // Logger + auto log = spdlog::get( "event" ); + auto logCSV = spdlog::get( "tabular" ); + + std::string hLine = "--"; + + unsigned strLen = 15; // max width of one column + char paddingChar = ' '; + + // Assemble Header for Screen Output + std::string lineToPrint = "| "; + std::string tmpLine = "-----------------"; + for( unsigned idxFields = 0; idxFields < _settings->GetNScreenOutput(); idxFields++ ) { + std::string tmp = _screenOutputFieldNames[idxFields]; + + if( strLen > tmp.size() ) // Padding + tmp.insert( 0, strLen - tmp.size(), paddingChar ); + else if( strLen < tmp.size() ) // Cutting + tmp.resize( strLen ); + + lineToPrint += tmp + " |"; + hLine += tmpLine; + } + log->info( "---------------------------- Solver Starts -----------------------------" ); + log->info( "| The simulation will run for {} iterations.", _nIter ); + log->info( "| The spatial grid contains {} cells.", _nCells ); + if( _settings->GetSolverName() != PN_SOLVER && _settings->GetSolverName() != CSD_PN_SOLVER ) { + log->info( "| The velocity grid contains {} points.", _nq ); + } + log->info( hLine ); + log->info( lineToPrint ); + log->info( hLine ); + + std::string lineToPrintCSV = ""; + for( int idxFields = 0; idxFields < _settings->GetNHistoryOutput() - 1; idxFields++ ) { + std::string tmp = _historyOutputFieldNames[idxFields]; + lineToPrintCSV += tmp + ","; + } + lineToPrintCSV += _historyOutputFieldNames[_settings->GetNHistoryOutput() - 1]; + logCSV->info( lineToPrintCSV ); +} + +void SNSolverHPCCUDA::DrawPostSolverOutput() { + + // Logger + auto log = spdlog::get( "event" ); + + std::string hLine = "--"; + + unsigned strLen = 10; // max width of one column + char paddingChar = ' '; + + // Assemble Header for Screen Output + std::string lineToPrint = "| "; + std::string tmpLine = "------------"; + for( unsigned idxFields = 0; idxFields < _settings->GetNScreenOutput(); idxFields++ ) { + std::string tmp = 
_screenOutputFieldNames[idxFields]; + + if( strLen > tmp.size() ) // Padding + tmp.insert( 0, strLen - tmp.size(), paddingChar ); + else if( strLen < tmp.size() ) // Cutting + tmp.resize( strLen ); + + lineToPrint += tmp + " |"; + hLine += tmpLine; + } + log->info( hLine ); +#ifndef BUILD_TESTING + log->info( "| The volume output files have been stored at " + _settings->GetOutputFile() ); + log->info( "| The log files have been stored at " + _settings->GetLogDir() + _settings->GetLogFile() ); +#endif + log->info( "--------------------------- Solver Finished ----------------------------" ); +} + +unsigned long SNSolverHPCCUDA::Idx2D( unsigned long idx1, unsigned long idx2, unsigned long len2 ) { return idx1 * len2 + idx2; } + +unsigned long SNSolverHPCCUDA::Idx3D( unsigned long idx1, unsigned long idx2, unsigned long idx3, unsigned long len2, unsigned long len3 ) { + return ( idx1 * len2 + idx2 ) * len3 + idx3; +} + +void SNSolverHPCCUDA::WriteVolumeOutput( unsigned idx_iter ) { + unsigned nGroups = (unsigned)_settings->GetNVolumeOutput(); + if( ( _settings->GetVolumeOutputFrequency() != 0 && idx_iter % (unsigned)_settings->GetVolumeOutputFrequency() == 0 ) || + ( idx_iter == _nIter - 1 ) /* need sol at last iteration */ ) { + for( unsigned idx_group = 0; idx_group < nGroups; idx_group++ ) { + switch( _settings->GetVolumeOutput()[idx_group] ) { + case MINIMAL: + if( _rank == 0 ) { + _outputFields[idx_group][0] = _scalarFlux; + } + break; + + case MOMENTS: +#pragma omp parallel for + for( unsigned idx_cell = 0; idx_cell < _nCells; ++idx_cell ) { + _outputFields[idx_group][0][idx_cell] = 0.0; + _outputFields[idx_group][1][idx_cell] = 0.0; + for( unsigned idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) { + _outputFields[idx_group][0][idx_cell] += + _quadPts[Idx2D( idx_sys, 0, _nDim )] * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys]; + _outputFields[idx_group][1][idx_cell] += + _quadPts[Idx2D( idx_sys, 1, _nDim )] * _sol[Idx2D( idx_cell, idx_sys, _localNSys )] * _quadWeights[idx_sys]; + } + } +#ifdef IMPORT_MPI + MPI_Barrier( MPI_COMM_WORLD ); + MPI_Allreduce( + _outputFields[idx_group][0].data(), _outputFields[idx_group][0].data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + MPI_Allreduce( + _outputFields[idx_group][1].data(), _outputFields[idx_group][1].data(), _nCells, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + MPI_Barrier( MPI_COMM_WORLD ); +#endif + break; + default: ErrorMessages::Error( "Volume Output Group not defined for HPC SN Solver!", CURRENT_FUNCTION ); break; + } + } + } +} + +void SNSolverHPCCUDA::PrintVolumeOutput( int idx_iter ) { + if( _settings->GetSaveRestartSolutionFrequency() != 0 && idx_iter % (int)_settings->GetSaveRestartSolutionFrequency() == 0 ) { + // std::cout << "Saving restart solution at iteration " << idx_iter << std::endl; + WriteRestartSolution( _settings->GetOutputFile(), + _sol, + _scalarFlux, + _rank, + idx_iter, + _totalAbsorptionHohlraumCenter, + _totalAbsorptionHohlraumVertical, + _totalAbsorptionHohlraumHorizontal, + _totalAbsorptionLattice ); + } + + if( _settings->GetVolumeOutputFrequency() != 0 && idx_iter % (int)_settings->GetVolumeOutputFrequency() == 0 ) { + WriteVolumeOutput( idx_iter ); + if( _rank == 0 ) { + ExportVTK( _settings->GetOutputFile() + "_" + std::to_string( idx_iter ), _outputFields, _outputFieldNames, _mesh ); // slow + } + } + if( idx_iter == (int)_nIter - 1 ) { // Last iteration write without suffix. 
+ WriteVolumeOutput( idx_iter ); + if( _rank == 0 ) { + ExportVTK( _settings->GetOutputFile(), _outputFields, _outputFieldNames, _mesh ); + } + } +} + +void SNSolverHPCCUDA::PrepareVolumeOutput() { + unsigned nGroups = (unsigned)_settings->GetNVolumeOutput(); + + _outputFieldNames.resize( nGroups ); + _outputFields.resize( nGroups ); + + // Prepare all OutputGroups ==> Specified in option VOLUME_OUTPUT + for( unsigned idx_group = 0; idx_group < nGroups; idx_group++ ) { + // Prepare all Output Fields per group + + // Different procedure, depending on the Group... + switch( _settings->GetVolumeOutput()[idx_group] ) { + case MINIMAL: + // Currently only one entry ==> rad flux + _outputFields[idx_group].resize( 1 ); + _outputFieldNames[idx_group].resize( 1 ); + + _outputFields[idx_group][0].resize( _nCells ); + _outputFieldNames[idx_group][0] = "scalar flux"; + break; + case MOMENTS: + // As many entries as there are moments in the system + _outputFields[idx_group].resize( _nOutputMoments ); + _outputFieldNames[idx_group].resize( _nOutputMoments ); + + for( unsigned idx_moment = 0; idx_moment < _nOutputMoments; idx_moment++ ) { + _outputFieldNames[idx_group][idx_moment] = std::string( "u_" + std::to_string( idx_moment ) ); + _outputFields[idx_group][idx_moment].resize( _nCells ); + } + break; + + default: ErrorMessages::Error( "Volume Output Group not defined for HPC SN Solver!", CURRENT_FUNCTION ); break; + } + } +} + +void SNSolverHPCCUDA::SetGhostCells() { + if( _settings->GetProblemName() == PROBLEM_Lattice ) { + // #pragma omp parallel for + for( unsigned idx_cell = 0; idx_cell < _nCells; idx_cell++ ) { + if( _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::NEUMANN || _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::DIRICHLET ) { + _ghostCells[idx_cell] = std::vector( _localNSys, 0.0 ); + } + } + } + else if( _settings->GetProblemName() == PROBLEM_HalfLattice ) { // HALF LATTICE NOT WORKING + ErrorMessages::Error( "Test case does not work with MPI", CURRENT_FUNCTION ); + } + else if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) { + + auto nodes = _mesh->GetNodes(); + double tol = 1e-12; // For distance to boundary + + // #pragma omp parallel for + for( unsigned idx_cell = 0; idx_cell < _nCells; idx_cell++ ) { + + if( _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::NEUMANN || _cellBoundaryTypes[idx_cell] == BOUNDARY_TYPE::DIRICHLET ) { + _ghostCells[idx_cell] = std::vector( _localNSys, 0.0 ); + + auto localCellNodes = _mesh->GetCells()[idx_cell]; + + for( unsigned idx_node = 0; idx_node < _mesh->GetNumNodesPerCell(); idx_node++ ) { // Check if corner node is in this cell + if( nodes[localCellNodes[idx_node]][0] < -0.65 + tol ) { // close to 0 => left boundary + for( unsigned idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) { + if( _quadPts[Idx2D( idx_sys, 0, _nDim )] > 0.0 ) _ghostCells[idx_cell][idx_sys] = 1.0; + } + break; + } + else if( nodes[localCellNodes[idx_node]][0] > 0.65 - tol ) { // right boundary + for( unsigned idx_sys = 0; idx_sys < _localNSys; idx_sys++ ) { + if( _quadPts[Idx2D( idx_sys, 0, _nDim )] < 0.0 ) _ghostCells[idx_cell][idx_sys] = 1.0; + } + break; + } + else if( nodes[localCellNodes[idx_node]][1] < -0.65 + tol ) { // lower boundary + break; + } + else if( nodes[localCellNodes[idx_node]][1] > 0.65 - tol ) { // upper boundary + break; + } + else if( idx_node == _mesh->GetNumNodesPerCell() - 1 ) { + ErrorMessages::Error( " Problem with ghost cell setup and boundary of this mesh ", CURRENT_FUNCTION ); + } + } + } + } + } + else if( _settings->GetProblemName() == 
PROBLEM_QuarterHohlraum ) {
+        ErrorMessages::Error( "Test case does not work with MPI", CURRENT_FUNCTION );
+    }
+}
+
+void SNSolverHPCCUDA::SetProbingCellsLineGreen() {
+
+    if( _settings->GetProblemName() == PROBLEM_SymmetricHohlraum ) {
+        assert( _nProbingCellsLineGreen % 2 == 0 );
+
+        std::vector<double> p1 = { _cornerUpperLeftGreen[0] + _thicknessGreen / 2.0, _cornerUpperLeftGreen[1] - _thicknessGreen / 2.0 };
+        std::vector<double> p2 = { _cornerUpperRightGreen[0] - _thicknessGreen / 2.0, _cornerUpperRightGreen[1] - _thicknessGreen / 2.0 };
+        std::vector<double> p3 = { _cornerLowerRightGreen[0] - _thicknessGreen / 2.0, _cornerLowerRightGreen[1] + _thicknessGreen / 2.0 };
+        std::vector<double> p4 = { _cornerLowerLeftGreen[0] + _thicknessGreen / 2.0, _cornerLowerLeftGreen[1] + _thicknessGreen / 2.0 };
+
+        double verticalLineWidth   = std::abs( p1[1] - p4[1] );
+        double horizontalLineWidth = std::abs( p1[0] - p2[0] );
+
+        double pt_ratio_h = horizontalLineWidth / ( horizontalLineWidth + verticalLineWidth );
+
+        unsigned nHorizontalProbingCells = (unsigned)std::ceil( _nProbingCellsLineGreen / 2 * pt_ratio_h );
+        unsigned nVerticalProbingCells   = _nProbingCellsLineGreen / 2 - nHorizontalProbingCells;
+        assert( nHorizontalProbingCells > 1 );
+        assert( nVerticalProbingCells > 1 );
+
+        _probingCellsLineGreen = std::vector<unsigned>( 2 * ( nVerticalProbingCells + nHorizontalProbingCells ) );
+
+        // Sample points on each side of the rectangle
+        std::vector<unsigned> side1 = linspace2D( p1, p2, nHorizontalProbingCells );    // upper horizontal
+        std::vector<unsigned> side2 = linspace2D( p2, p3, nVerticalProbingCells );      // right vertical
+        std::vector<unsigned> side3 = linspace2D( p3, p4, nHorizontalProbingCells );    // lower horizontal
+        std::vector<unsigned> side4 = linspace2D( p4, p1, nVerticalProbingCells );      // left vertical
+
+        for( unsigned i = 0; i < nHorizontalProbingCells; ++i ) {
+            _probingCellsLineGreen[i] = side1[i];
+        }
+        for( unsigned i = 0; i < nVerticalProbingCells; ++i ) {
+            _probingCellsLineGreen[i + nHorizontalProbingCells] = side2[i];
+        }
+        for( unsigned i = 0; i < nHorizontalProbingCells; ++i ) {
+            _probingCellsLineGreen[i + nVerticalProbingCells + nHorizontalProbingCells] = side3[i];
+        }
+        for( unsigned i = 0; i < nVerticalProbingCells; ++i ) {
+            _probingCellsLineGreen[i + nVerticalProbingCells + 2 * nHorizontalProbingCells] = side4[i];
+        }
+
+        // Block-wise approach
+        // Initialize the vector to store the corner points of each block
+        std::vector<std::vector<std::vector<double>>> block_corners;
+
+        double block_size = 0.05;
+        const double cx   = _centerGreen[0];
+        const double cy   = _centerGreen[1];
+
+        // Loop to fill the blocks
+        for( int i = 0; i < 8; ++i ) {    // 8 blocks in the x-direction (horizontal) (upper) (left to right)
+
+            // Top row
+            double x1 = -0.2 + cx + i * block_size;
+            double y1 = 0.4 + cy;
+            double x2 = x1 + block_size;
+            double y2 = y1 - block_size;
+
+            std::vector<std::vector<double>> corners = {
+                { x1, y1 },    // top-left
+                { x2, y1 },    // top-right
+                { x2, y2 },    // bottom-right
+                { x1, y2 }     // bottom-left
+            };
+            block_corners.push_back( corners );
+        }
+
+        for( int j = 0; j < 14; ++j ) {    // 14 blocks in the y-direction (vertical), right column
+            double x1 = 0.15 + cx;
+            double y1 = 0.35 + cy - j * block_size;
+            double x2 = x1 + block_size;
+            double y2 = y1 - block_size;
+
+            // Store the four corner points for this block
+            std::vector<std::vector<double>> corners = {
+                { x1, y1 },    // top-left
+                { x2, y1 },    // top-right
+                { x2, y2 },    // bottom-right
+                { x1, y2 }     // bottom-left
+            };
+
+            block_corners.push_back( corners );
+        }
+
+        for( int i = 0; i < 8; ++i ) {    // 8 blocks in the x-direction (horizontal) (lower) (right to left)
+            // bottom row
+            double x1 = 0.15 + cx - i * block_size;
+            double y1 = -0.35 + cy;
+            double x2 = x1 + block_size;
+            double y2 = y1 - block_size;
+
+            std::vector<std::vector<double>> corners = {
+                { x1, y1 },    // top-left
+                { x2, y1 },    // top-right
+                { x2, y2 },    // bottom-right
+                { x1, y2 }     // bottom-left
+            };
+            block_corners.push_back( corners );
+        }
+
+        for( int j = 0; j < 14; ++j ) {    // 14 blocks in the y-direction (vertical), left column (down to up)
+
+            double x1 = -0.2 + cx;
+            double y1 = -0.3 + cy + j * block_size;
+            double x2 = x1 + block_size;
+            double y2 = y1 - block_size;
+
+            // Store the four corner points for this block
+            std::vector<std::vector<double>> corners = {
+                { x1, y1 },    // top-left
+                { x2, y1 },    // top-right
+                { x2, y2 },    // bottom-right
+                { x1, y2 }     // bottom-left
+            };
+
+            block_corners.push_back( corners );
+        }
+
+        // Compute the probing cells for each block
+        for( unsigned i = 0; i < _nProbingCellsBlocksGreen; i++ ) {
+            _probingCellsBlocksGreen.push_back( _mesh->GetCellsofRectangle( block_corners[i] ) );
+        }
+    }
+}
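The four loops above generate 8 + 14 + 8 + 14 = 44 blocks, which is exactly the _nProbingCellsBlocksGreen = 44 hard-coded in the constructor. A defensive check (not present in the original code) would make that coupling explicit:

    assert( block_corners.size() == _nProbingCellsBlocksGreen );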
+
+void SNSolverHPCCUDA::ComputeQOIsGreenProbingLine() {
+#pragma omp parallel for
+    for( unsigned i = 0; i < _nProbingCellsLineGreen; i++ ) {    // Loop over probing cells
+        _absorptionValsLineSegment[i] =
+            ( _sigmaT[_probingCellsLineGreen[i]] - _sigmaS[_probingCellsLineGreen[i]] ) * _scalarFlux[_probingCellsLineGreen[i]];
+    }
+
+    // Block-wise approach
+    // #pragma omp parallel for
+    for( unsigned i = 0; i < _nProbingCellsBlocksGreen; i++ ) {
+        _absorptionValsBlocksGreen[i] = 0.0;
+        for( unsigned j = 0; j < _probingCellsBlocksGreen[i].size(); j++ ) {
+            _absorptionValsBlocksGreen[i] += ( _sigmaT[_probingCellsBlocksGreen[i][j]] - _sigmaS[_probingCellsBlocksGreen[i][j]] ) *
+                                             _scalarFlux[_probingCellsBlocksGreen[i][j]] * _areas[_probingCellsBlocksGreen[i][j]];
+        }
+    }
+    // std::cout << _absorptionValsBlocksGreen[1] << std::endl;
+    // std::cout << _absorptionValsLineSegment[1] << std::endl;
+}
+
+std::vector<unsigned> SNSolverHPCCUDA::linspace2D( const std::vector<double>& start, const std::vector<double>& end, unsigned num_points ) {
+    /**
+     * Generate a 2D linspace based on the start and end points with a specified number of points.
+ * + * @param start vector of starting x and y coordinates + * @param end vector of ending x and y coordinates + * @param num_points number of points to generate + * + * @return vector of unsigned integers representing the result + */ + + std::vector result; + assert( num_points > 1 ); + result.resize( num_points ); + double stepX = ( end[0] - start[0] ) / ( num_points - 1 ); + double stepY = ( end[1] - start[1] ) / ( num_points - 1 ); +#pragma omp parallel for + for( unsigned i = 0; i < num_points; ++i ) { + double x = start[0] + i * stepX; + double y = start[1] + i * stepY; + + result[i] = _mesh->GetCellOfKoordinate( x, y ); + } + + return result; +} + +void SNSolverHPCCUDA::ComputeCellsPerimeterLattice() { + double l_1 = 1.5; // perimeter 1 + double l_2 = 2.5; // perimeter 2 + auto nodes = _mesh->GetNodes(); + auto cells = _mesh->GetCells(); + auto cellMids = _mesh->GetCellMidPoints(); + auto normals = _mesh->GetNormals(); + auto neigbors = _mesh->GetNeighbours(); + + _isPerimeterLatticeCell1.resize( _mesh->GetNumCells(), false ); + _isPerimeterLatticeCell2.resize( _mesh->GetNumCells(), false ); + + for( unsigned idx_cell = 0; idx_cell < _mesh->GetNumCells(); ++idx_cell ) { + if( abs( cellMids[idx_cell][0] ) < l_1 && abs( cellMids[idx_cell][1] ) < l_1 ) { + // Cell is within perimeter + for( unsigned idx_nbr = 0; idx_nbr < _mesh->GetNumNodesPerCell(); ++idx_nbr ) { + if( neigbors[idx_cell][idx_nbr] == _mesh->GetNumCells() ) { + continue; // Skip boundary - ghost cells + } + + if( ( abs( cellMids[neigbors[idx_cell][idx_nbr]][0] ) > l_1 && abs( cellMids[idx_cell][0] ) < l_1 ) || + ( abs( cellMids[neigbors[idx_cell][idx_nbr]][1] ) > l_1 && abs( cellMids[idx_cell][1] ) < l_1 ) ) { + // neighbor is outside perimeter + _cellsLatticePerimeter1[idx_cell].push_back( idx_nbr ); + _isPerimeterLatticeCell1[idx_cell] = true; + } + } + } + else if( abs( cellMids[idx_cell][0] ) < l_2 && abs( cellMids[idx_cell][1] ) < l_2 ) { + // Cell is within perimeter + for( unsigned idx_nbr = 0; idx_nbr < _mesh->GetNumNodesPerCell(); ++idx_nbr ) { + if( neigbors[idx_cell][idx_nbr] == _mesh->GetNumCells() ) { + continue; // Skip boundary - ghost cells + } + if( ( abs( cellMids[neigbors[idx_cell][idx_nbr]][0] ) > l_2 && abs( cellMids[idx_cell][0] ) < l_2 ) || + ( abs( cellMids[neigbors[idx_cell][idx_nbr]][1] ) > l_2 && abs( cellMids[idx_cell][1] ) < l_2 ) ) { + // neighbor is outside perimeter + _cellsLatticePerimeter2[idx_cell].push_back( idx_nbr ); + _isPerimeterLatticeCell2[idx_cell] = true; + } + } + } + } +} diff --git a/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order1.cfg b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order1.cfg new file mode 100644 index 00000000..afc45aae --- /dev/null +++ b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order1.cfg @@ -0,0 +1,45 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Lattice Validation SN-HPC % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% +% ---- File specifications ---- +% +OUTPUT_DIR = ../../../result +OUTPUT_FILE = lattice_hpc_200_cpu_order1 +LOG_DIR = ../../../result/logs +LOG_FILE = lattice_hpc_200_cpu_order1_output +MESH_FILE = lattice_hpc_200.mesh +FORCE_CONNECTIVITY_RECOMPUTE = YES +% +% ---- Problem specifications ---- +% +PROBLEM = LATTICE +SPATIAL_DIM = 2 +SOURCE_MAGNITUDE = 1.0 +% +% ---- Solver specifications ---- +% +HPC_SOLVER = YES +SOLVER = SN_SOLVER +CFL_NUMBER = 0.5 +TIME_FINAL = 0.2 +RECONS_ORDER = 1 +TIME_INTEGRATION_ORDER = 1 +% +% ---- Boundary Conditions ---- +% +BC_NEUMANN = ( void ) +% 
+% ---- Quadrature ---- +% +QUAD_TYPE = GAUSS_LEGENDRE_TENSORIZED_2D +QUAD_ORDER = 4 +% +% ----- Output ---- +% +VOLUME_OUTPUT = (MINIMAL) +VOLUME_OUTPUT_FREQUENCY = 0 +SCREEN_OUTPUT = (ITER, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT) +SCREEN_OUTPUT_FREQUENCY = 1 +HISTORY_OUTPUT = (ITER, SIM_TIME, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT, CUR_OUTFLOW, TOTAL_OUTFLOW, CUR_OUTFLOW_P1, TOTAL_OUTFLOW_P1, CUR_OUTFLOW_P2, TOTAL_OUTFLOW_P2, MAX_OUTFLOW, CUR_PARTICLE_ABSORPTION, TOTAL_PARTICLE_ABSORPTION, MAX_PARTICLE_ABSORPTION) +HISTORY_OUTPUT_FREQUENCY = 1 diff --git a/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg new file mode 100644 index 00000000..ee133cc7 --- /dev/null +++ b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cpu_order2.cfg @@ -0,0 +1,45 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Lattice Validation SN-HPC % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% +% ---- File specifications ---- +% +OUTPUT_DIR = ../../../result +OUTPUT_FILE = lattice_hpc_200_cpu_order2 +LOG_DIR = ../../../result/logs +LOG_FILE = lattice_hpc_200_cpu_order2_output +MESH_FILE = lattice_hpc_200.mesh +FORCE_CONNECTIVITY_RECOMPUTE = YES +% +% ---- Problem specifications ---- +% +PROBLEM = LATTICE +SPATIAL_DIM = 2 +SOURCE_MAGNITUDE = 1.0 +% +% ---- Solver specifications ---- +% +HPC_SOLVER = YES +SOLVER = SN_SOLVER +CFL_NUMBER = 0.5 +TIME_FINAL = 0.2 +RECONS_ORDER = 2 +TIME_INTEGRATION_ORDER = 2 +% +% ---- Boundary Conditions ---- +% +BC_NEUMANN = ( void ) +% +% ---- Quadrature ---- +% +QUAD_TYPE = GAUSS_LEGENDRE_TENSORIZED_2D +QUAD_ORDER = 4 +% +% ----- Output ---- +% +VOLUME_OUTPUT = (MINIMAL) +VOLUME_OUTPUT_FREQUENCY = 0 +SCREEN_OUTPUT = (ITER, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT) +SCREEN_OUTPUT_FREQUENCY = 1 +HISTORY_OUTPUT = (ITER, SIM_TIME, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT, CUR_OUTFLOW, TOTAL_OUTFLOW, CUR_OUTFLOW_P1, TOTAL_OUTFLOW_P1, CUR_OUTFLOW_P2, TOTAL_OUTFLOW_P2, MAX_OUTFLOW, CUR_PARTICLE_ABSORPTION, TOTAL_PARTICLE_ABSORPTION, MAX_PARTICLE_ABSORPTION) +HISTORY_OUTPUT_FREQUENCY = 1 diff --git a/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_csv_reference b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_csv_reference index 140a7be3..b5268640 100644 --- a/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_csv_reference +++ b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_csv_reference @@ -1,4 +1,4 @@ -2026-02-10 15:35:39.595600 ,Iter,Sim_time,Wall_time_[s],Mass,RMS_flux,VTK_out,CSV_out,Cur_outflow,Total_outflow,Cur_outflow_P1,Total_outflow_P1,Cur_outflow_P2,Total_outflow_P2,Max_outflow,Cur_absorption,Total_absorption,Max_absorption -2026-02-11 22:02:50.261192 ,0.000000,2.474874E-01,1.064435E-01,6.382169E+00,4.127129E+00,0.000000E+00,1.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00 -2026-02-11 22:02:50.375590 ,1.000000,4.949747E-01,2.208639E-01,9.623368E+00,3.464680E+00,0.000000E+00,1.000000E+00,0.000000E+00,0.000000E+00,3.459946E-01,8.562928E-02,0.000000E+00,0.000000E+00,0.000000E+00,1.638836E-01,4.055913E-02,2.562927E-02 -2026-02-11 22:02:50.483330 ,2.000000,5.000000E-01,3.285845E-01,6.535954E+00,6.883754E-02,1.000000E+00,1.000000E+00,0.000000E+00,0.000000E+00,3.570196E-01,8.742340E-02,0.000000E+00,0.000000E+00,0.000000E+00,1.715809E-01,4.142137E-02,2.679525E-02 +2026-02-11 20:35:43.785627 
,Iter,Sim_time,Wall_time_[s],Mass,RMS_flux,VTK_out,CSV_out,Cur_outflow,Total_outflow,Cur_outflow_P1,Total_outflow_P1,Cur_outflow_P2,Total_outflow_P2,Max_outflow,Cur_absorption,Total_absorption,Max_absorption +2026-02-11 20:35:43.788487 ,0.000000,2.474874E-01,2.629354E-03,3.191085E+00,4.228409E+00,0.000000E+00,1.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00 +2026-02-11 20:35:43.790886 ,1.000000,4.949747E-01,5.076625E-03,6.407226E+00,3.695820E+00,0.000000E+00,1.000000E+00,0.000000E+00,0.000000E+00,3.459946E-01,8.562928E-02,0.000000E+00,0.000000E+00,0.000000E+00,1.638836E-01,4.055913E-02,2.562927E-02 +2026-02-11 20:35:43.793245 ,2.000000,5.000000E-01,7.424587E-03,6.471590E+00,6.894449E-02,1.000000E+00,1.000000E+00,0.000000E+00,0.000000E+00,3.570196E-01,8.742340E-02,0.000000E+00,0.000000E+00,0.000000E+00,1.715809E-01,4.142137E-02,2.679525E-02 diff --git a/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order1.cfg b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order1.cfg new file mode 100644 index 00000000..c7d7c4db --- /dev/null +++ b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order1.cfg @@ -0,0 +1,45 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Lattice Validation SN-HPC % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% +% ---- File specifications ---- +% +OUTPUT_DIR = ../../../result +OUTPUT_FILE = lattice_hpc_200_cuda_order1 +LOG_DIR = ../../../result/logs +LOG_FILE = lattice_hpc_200_cuda_order1_output +MESH_FILE = lattice_hpc_200.mesh +FORCE_CONNECTIVITY_RECOMPUTE = YES +% +% ---- Problem specifications ---- +% +PROBLEM = LATTICE +SPATIAL_DIM = 2 +SOURCE_MAGNITUDE = 1.0 +% +% ---- Solver specifications ---- +% +HPC_SOLVER = YES +SOLVER = SN_SOLVER +CFL_NUMBER = 0.5 +TIME_FINAL = 0.2 +RECONS_ORDER = 1 +TIME_INTEGRATION_ORDER = 1 +% +% ---- Boundary Conditions ---- +% +BC_NEUMANN = ( void ) +% +% ---- Quadrature ---- +% +QUAD_TYPE = GAUSS_LEGENDRE_TENSORIZED_2D +QUAD_ORDER = 4 +% +% ----- Output ---- +% +VOLUME_OUTPUT = (MINIMAL) +VOLUME_OUTPUT_FREQUENCY = 0 +SCREEN_OUTPUT = (ITER, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT) +SCREEN_OUTPUT_FREQUENCY = 1 +HISTORY_OUTPUT = (ITER, SIM_TIME, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT, CUR_OUTFLOW, TOTAL_OUTFLOW, CUR_OUTFLOW_P1, TOTAL_OUTFLOW_P1, CUR_OUTFLOW_P2, TOTAL_OUTFLOW_P2, MAX_OUTFLOW, CUR_PARTICLE_ABSORPTION, TOTAL_PARTICLE_ABSORPTION, MAX_PARTICLE_ABSORPTION) +HISTORY_OUTPUT_FREQUENCY = 1 diff --git a/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg new file mode 100644 index 00000000..00ab7978 --- /dev/null +++ b/tests/input/validation_tests/SN_solver_hpc/lattice_hpc_200_cuda_order2.cfg @@ -0,0 +1,45 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Lattice Validation SN-HPC % +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% +% ---- File specifications ---- +% +OUTPUT_DIR = ../../../result +OUTPUT_FILE = lattice_hpc_200_cuda_order2 +LOG_DIR = ../../../result/logs +LOG_FILE = lattice_hpc_200_cuda_order2_output +MESH_FILE = lattice_hpc_200.mesh +FORCE_CONNECTIVITY_RECOMPUTE = YES +% +% ---- Problem specifications ---- +% +PROBLEM = LATTICE +SPATIAL_DIM = 2 +SOURCE_MAGNITUDE = 1.0 +% +% ---- Solver specifications ---- +% +HPC_SOLVER = YES +SOLVER = SN_SOLVER +CFL_NUMBER = 0.5 +TIME_FINAL = 0.2 +RECONS_ORDER = 2 +TIME_INTEGRATION_ORDER = 2 +% +% ---- Boundary Conditions ---- +% 
+BC_NEUMANN = ( void ) +% +% ---- Quadrature ---- +% +QUAD_TYPE = GAUSS_LEGENDRE_TENSORIZED_2D +QUAD_ORDER = 4 +% +% ----- Output ---- +% +VOLUME_OUTPUT = (MINIMAL) +VOLUME_OUTPUT_FREQUENCY = 0 +SCREEN_OUTPUT = (ITER, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT) +SCREEN_OUTPUT_FREQUENCY = 1 +HISTORY_OUTPUT = (ITER, SIM_TIME, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT, CUR_OUTFLOW, TOTAL_OUTFLOW, CUR_OUTFLOW_P1, TOTAL_OUTFLOW_P1, CUR_OUTFLOW_P2, TOTAL_OUTFLOW_P2, MAX_OUTFLOW, CUR_PARTICLE_ABSORPTION, TOTAL_PARTICLE_ABSORPTION, MAX_PARTICLE_ABSORPTION) +HISTORY_OUTPUT_FREQUENCY = 1 diff --git a/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_cpu_order2.cfg b/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_cpu_order2.cfg new file mode 100644 index 00000000..3ddebc03 --- /dev/null +++ b/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_cpu_order2.cfg @@ -0,0 +1,45 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Symmetric Hohlraum Validation SN-HPC% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% +% ---- File specifications ---- +% +OUTPUT_DIR = ../../../result +OUTPUT_FILE = symmetric_hohlraum_hpc_200_cpu_order2 +LOG_DIR = ../../../result/logs +LOG_FILE = symmetric_hohlraum_hpc_200_cpu_order2_output +MESH_FILE = symmetric_hohlraum_hpc_200.mesh +FORCE_CONNECTIVITY_RECOMPUTE = YES +% +% ---- Problem specifications ---- +% +PROBLEM = SYMMETRIC_HOHLRAUM +SPATIAL_DIM = 2 +N_SAMPLING_PTS_LINE_GREEN = 20 +% +% ---- Solver specifications ---- +% +HPC_SOLVER = YES +SOLVER = SN_SOLVER +CFL_NUMBER = 0.5 +TIME_FINAL = 0.2 +RECONS_ORDER = 2 +TIME_INTEGRATION_ORDER = 2 +% +% ---- Boundary Conditions ---- +% +BC_NEUMANN = ( void ) +% +% ---- Quadrature ---- +% +QUAD_TYPE = GAUSS_LEGENDRE_TENSORIZED_2D +QUAD_ORDER = 6 +% +% ----- Output ---- +% +VOLUME_OUTPUT = (MINIMAL) +VOLUME_OUTPUT_FREQUENCY = 0 +SCREEN_OUTPUT = (ITER, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT) +SCREEN_OUTPUT_FREQUENCY = 1 +HISTORY_OUTPUT = (ITER, SIM_TIME, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT, CUR_OUTFLOW, TOTAL_OUTFLOW, MAX_OUTFLOW, TOTAL_PARTICLE_ABSORPTION, TOTAL_PARTICLE_ABSORPTION_CENTER, TOTAL_PARTICLE_ABSORPTION_VERTICAL, TOTAL_PARTICLE_ABSORPTION_HORIZONTAL, PROBE_MOMENT_TIME_TRACE, VAR_ABSORPTION_GREEN, ABSORPTION_GREEN_BLOCK, ABSORPTION_GREEN_LINE) +HISTORY_OUTPUT_FREQUENCY = 1 diff --git a/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_csv_reference b/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_csv_reference index 2db1b11c..c9a61628 100644 --- a/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_csv_reference +++ b/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_csv_reference @@ -1,6 +1,6 @@ -2026-02-10 15:35:51.136388 ,Iter,Sim_time,Wall_time_[s],Mass,RMS_flux,VTK_out,CSV_out,Cur_outflow,Total_outflow,Max_outflow,Total_absorption,Cumulated_absorption_center,Cumulated_absorption_vertical_wall,Cumulated_absorption_horizontal_wall,Var. 
absorption green,Probe 0 u_0,Probe 0 u_1,Probe 0 u_2,Probe 1 u_0,Probe 1 u_1,Probe 1 u_2,Probe 2 u_0,Probe 2 u_1,Probe 2 u_2,Probe 3 u_0,Probe 3 u_1,Probe 3 u_2,Probe Green Line 0,Probe Green Line 1,Probe Green Line 2,Probe Green Line 3,Probe Green Line 4,Probe Green Line 5,Probe Green Line 6,Probe Green Line 7,Probe Green Line 8,Probe Green Line 9,Probe Green Line 10,Probe Green Line 11,Probe Green Line 12,Probe Green Line 13,Probe Green Line 14,Probe Green Line 15,Probe Green Line 16,Probe Green Line 17,Probe Green Line 18,Probe Green Line 19,Probe Green Block 0,Probe Green Block 1,Probe Green Block 2,Probe Green Block 3,Probe Green Block 4,Probe Green Block 5,Probe Green Block 6,Probe Green Block 7,Probe Green Block 8,Probe Green Block 9,Probe Green Block 10,Probe Green Block 11,Probe Green Block 12,Probe Green Block 13,Probe Green Block 14,Probe Green Block 15,Probe Green Block 16,Probe Green Block 17,Probe Green Block 18,Probe Green Block 19,Probe Green Block 20,Probe Green Block 21,Probe Green Block 22,Probe Green Block 23,Probe Green Block 24,Probe Green Block 25,Probe Green Block 26,Probe Green Block 27,Probe Green Block 28,Probe Green Block 29,Probe Green Block 30,Probe Green Block 31,Probe Green Block 32,Probe Green Block 33,Probe Green Block 34,Probe Green Block 35,Probe Green Block 36,Probe Green Block 37,Probe Green Block 38,Probe Green Block 39,Probe Green Block 40,Probe Green Block 41,Probe Green Block 42,Probe Green Block 43 -2026-02-11 21:52:38.908896 ,0.000000,4.596194E-02,1.315444E-01,3.912731E-01,3.889031E+00,0.000000E+00,1.000000E+00,1.699372E-01,7.810644E-03,2.896596E-01,4.720629E-02,0.000000E+00,0.000000E+00,4.720629E-02,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00 -2026-02-11 21:52:39.043950 
,1.000000,9.192388E-02,2.665456E-01,5.997554E-01,3.915592E+00,0.000000E+00,1.000000E+00,3.818578E-01,2.536157E-02,6.095445E-01,1.556642E-01,0.000000E+00,0.000000E+00,1.556642E-01,0.000000E+00,3.054249E+00,2.369008E+00,-1.323735E-01,1.120689E+00,-9.105252E-01,2.929687E-01,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00 -2026-02-11 21:52:39.162921 ,2.000000,1.378858E-01,3.855422E-01,8.178787E-01,4.235208E+00,0.000000E+00,1.000000E+00,6.493960E-01,5.520907E-02,9.871098E-01,3.418355E-01,0.000000E+00,0.000000E+00,3.418355E-01,0.000000E+00,9.550106E+00,7.238872E+00,-5.483415E-02,4.380960E+00,-3.509899E+00,5.845488E-01,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,8.166518E-03,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00 -2026-02-11 21:52:39.278844 
,3.000000,1.838478E-01,5.015391E-01,1.052166E+00,4.954813E+00,0.000000E+00,1.000000E+00,9.808722E-01,1.002919E-01,1.453090E+00,6.248197E-01,4.318221E-05,0.000000E+00,6.247765E-01,1.079086E-07,1.716434E+01,1.270100E+01,1.225169E-01,9.597497E+00,-7.507195E+00,7.118933E-01,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,7.868604E-03,7.868604E-03,1.040100E-02,8.631371E-03,8.631648E-03,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,7.868604E-03,7.868604E-03,1.040100E-02,8.631371E-03,8.631648E-03,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,6.648971E-05,0.000000E+00,8.788844E-05,0.000000E+00,0.000000E+00,7.293509E-05,0.000000E+00,7.293743E-05,0.000000E+00,0.000000E+00,7.293742E-05,0.000000E+00,0.000000E+00,7.284801E-05,1.670795E-06,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,2.715306E-02,6.648971E-05,0.000000E+00,8.788844E-05,0.000000E+00,0.000000E+00,7.293509E-05,0.000000E+00,7.293743E-05,0.000000E+00,0.000000E+00,7.293740E-05,0.000000E+00,0.000000E+00,7.293113E-05 -2026-02-11 21:52:39.406889 ,4.000000,2.000000E-01,6.295428E-01,8.550401E-01,1.614070E+00,1.000000E+00,1.000000E+00,6.421574E-01,1.106641E-01,1.453090E+00,6.872688E-01,6.816741E-05,0.000000E+00,6.872006E-01,1.253516E-07,2.000858E+01,1.468820E+01,9.692551E-02,1.144289E+01,-8.866540E+00,6.303190E-01,9.269634E-04,8.323807E-04,-2.734572E-04,1.104766E-03,8.038164E-04,-5.334637E-04,0.000000E+00,2.793765E-03,3.405374E-03,7.363370E-03,7.363370E-03,1.036321E-02,8.481050E-03,8.486865E-03,0.000000E+00,0.000000E+00,0.000000E+00,2.798283E-03,3.405374E-03,7.363370E-03,7.363370E-03,1.036321E-02,8.481050E-03,8.486869E-03,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,1.196639E-04,0.000000E+00,8.730187E-05,0.000000E+00,0.000000E+00,0.000000E+00,6.222047E-05,0.000000E+00,8.756911E-05,0.000000E+00,0.000000E+00,7.166487E-05,0.000000E+00,7.171401E-05,0.000000E+00,0.000000E+00,7.172550E-05,0.000000E+00,0.000000E+00,7.105263E-05,4.817435E-05,0.000000E+00,8.862358E-07,0.000000E+00,8.762648E-05,0.000000E+00,0.000000E+00,2.028076E-02,6.222047E-05,0.000000E+00,8.756911E-05,0.000000E+00,0.000000E+00,7.166487E-05,0.000000E+00,7.171404E-05,0.000000E+00,0.000000E+00,7.172476E-05,0.000000E+00,0.000000E+00,7.284716E-05 +2026-02-11 20:35:44.188132 ,Iter,Sim_time,Wall_time_[s],Mass,RMS_flux,VTK_out,CSV_out,Cur_outflow,Total_outflow,Max_outflow,Total_absorption,Cumulated_absorption_center,Cumulated_absorption_vertical_wall,Cumulated_absorption_horizontal_wall,Var. 
absorption green,Probe 0 u_0,Probe 0 u_1,Probe 0 u_2,Probe 1 u_0,Probe 1 u_1,Probe 1 u_2,Probe 2 u_0,Probe 2 u_1,Probe 2 u_2,Probe 3 u_0,Probe 3 u_1,Probe 3 u_2,Probe Green Line 0,Probe Green Line 1,Probe Green Line 2,Probe Green Line 3,Probe Green Line 4,Probe Green Line 5,Probe Green Line 6,Probe Green Line 7,Probe Green Line 8,Probe Green Line 9,Probe Green Line 10,Probe Green Line 11,Probe Green Line 12,Probe Green Line 13,Probe Green Line 14,Probe Green Line 15,Probe Green Line 16,Probe Green Line 17,Probe Green Line 18,Probe Green Line 19,Probe Green Block 0,Probe Green Block 1,Probe Green Block 2,Probe Green Block 3,Probe Green Block 4,Probe Green Block 5,Probe Green Block 6,Probe Green Block 7,Probe Green Block 8,Probe Green Block 9,Probe Green Block 10,Probe Green Block 11,Probe Green Block 12,Probe Green Block 13,Probe Green Block 14,Probe Green Block 15,Probe Green Block 16,Probe Green Block 17,Probe Green Block 18,Probe Green Block 19,Probe Green Block 20,Probe Green Block 21,Probe Green Block 22,Probe Green Block 23,Probe Green Block 24,Probe Green Block 25,Probe Green Block 26,Probe Green Block 27,Probe Green Block 28,Probe Green Block 29,Probe Green Block 30,Probe Green Block 31,Probe Green Block 32,Probe Green Block 33,Probe Green Block 34,Probe Green Block 35,Probe Green Block 36,Probe Green Block 37,Probe Green Block 38,Probe Green Block 39,Probe Green Block 40,Probe Green Block 41,Probe Green Block 42,Probe Green Block 43 +2026-02-11 20:35:44.192250 ,0.000000,4.596194E-02,3.171939E-03,1.956365E-01,3.920533E+00,0.000000E+00,1.000000E+00,1.699372E-01,7.810644E-03,2.896596E-01,4.720629E-02,0.000000E+00,0.000000E+00,4.720629E-02,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00 +2026-02-11 20:35:44.194572 
,1.000000,9.192388E-02,5.533727E-03,3.976960E-01,3.469567E+00,0.000000E+00,1.000000E+00,3.818578E-01,2.536157E-02,6.095445E-01,1.556642E-01,0.000000E+00,0.000000E+00,1.556642E-01,0.000000E+00,3.054249E+00,2.369008E+00,-1.323735E-01,1.120689E+00,-9.105252E-01,2.929687E-01,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00 +2026-02-11 20:35:44.196995 ,2.000000,1.378858E-01,7.951983E-03,6.077873E-01,3.284891E+00,0.000000E+00,1.000000E+00,6.493960E-01,5.520907E-02,9.871098E-01,3.418355E-01,0.000000E+00,0.000000E+00,3.418355E-01,0.000000E+00,9.550106E+00,7.238872E+00,-5.483415E-02,4.380960E+00,-3.509899E+00,5.845488E-01,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,8.166518E-03,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00 +2026-02-11 20:35:44.199637 
,3.000000,1.838478E-01,1.061156E-02,8.299768E-01,3.350955E+00,0.000000E+00,1.000000E+00,9.808722E-01,1.002919E-01,1.453090E+00,6.248197E-01,4.318221E-05,0.000000E+00,6.247765E-01,1.079086E-07,1.716434E+01,1.270100E+01,1.225169E-01,9.597497E+00,-7.507195E+00,7.118933E-01,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,7.868604E-03,7.868604E-03,1.040100E-02,8.631371E-03,8.631648E-03,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,7.868604E-03,7.868604E-03,1.040100E-02,8.631371E-03,8.631648E-03,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,6.648971E-05,0.000000E+00,8.788844E-05,0.000000E+00,0.000000E+00,7.293509E-05,0.000000E+00,7.293743E-05,0.000000E+00,0.000000E+00,7.293742E-05,0.000000E+00,0.000000E+00,7.284801E-05,1.670795E-06,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,2.715306E-02,6.648971E-05,0.000000E+00,8.788844E-05,0.000000E+00,0.000000E+00,7.293509E-05,0.000000E+00,7.293743E-05,0.000000E+00,0.000000E+00,7.293740E-05,0.000000E+00,0.000000E+00,7.293113E-05 +2026-02-11 20:35:44.202108 ,4.000000,2.000000E-01,1.310093E-02,8.425085E-01,2.886801E+00,1.000000E+00,1.000000E+00,6.421574E-01,1.106641E-01,1.453090E+00,6.872688E-01,6.816741E-05,0.000000E+00,6.872006E-01,1.253516E-07,2.000858E+01,1.468820E+01,9.692551E-02,1.144289E+01,-8.866540E+00,6.303190E-01,9.269634E-04,8.323807E-04,-2.734572E-04,1.104766E-03,8.038164E-04,-5.334637E-04,0.000000E+00,2.793765E-03,3.405374E-03,7.363370E-03,7.363370E-03,1.036321E-02,8.481050E-03,8.486865E-03,0.000000E+00,0.000000E+00,0.000000E+00,2.798283E-03,3.405374E-03,7.363370E-03,7.363370E-03,1.036321E-02,8.481050E-03,8.486869E-03,0.000000E+00,0.000000E+00,0.000000E+00,0.000000E+00,1.196639E-04,0.000000E+00,8.730187E-05,0.000000E+00,0.000000E+00,0.000000E+00,6.222047E-05,0.000000E+00,8.756911E-05,0.000000E+00,0.000000E+00,7.166487E-05,0.000000E+00,7.171401E-05,0.000000E+00,0.000000E+00,7.172550E-05,0.000000E+00,0.000000E+00,7.105263E-05,4.817435E-05,0.000000E+00,8.862358E-07,0.000000E+00,8.762648E-05,0.000000E+00,0.000000E+00,2.028076E-02,6.222047E-05,0.000000E+00,8.756911E-05,0.000000E+00,0.000000E+00,7.166487E-05,0.000000E+00,7.171404E-05,0.000000E+00,0.000000E+00,7.172476E-05,0.000000E+00,0.000000E+00,7.284716E-05 diff --git a/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_cuda_order2.cfg b/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_cuda_order2.cfg new file mode 100644 index 00000000..1da17b24 --- /dev/null +++ b/tests/input/validation_tests/SN_solver_hpc/symmetric_hohlraum_hpc_200_cuda_order2.cfg @@ -0,0 +1,45 @@ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Symmetric Hohlraum Validation SN-HPC% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% +% ---- File specifications ---- +% +OUTPUT_DIR = ../../../result +OUTPUT_FILE = symmetric_hohlraum_hpc_200_cuda_order2 +LOG_DIR = ../../../result/logs +LOG_FILE = symmetric_hohlraum_hpc_200_cuda_order2_output +MESH_FILE = symmetric_hohlraum_hpc_200.mesh +FORCE_CONNECTIVITY_RECOMPUTE = YES +% +% ---- Problem specifications ---- +% +PROBLEM = SYMMETRIC_HOHLRAUM +SPATIAL_DIM = 2 +N_SAMPLING_PTS_LINE_GREEN = 20 +% +% ---- Solver specifications ---- +% +HPC_SOLVER = YES +SOLVER = SN_SOLVER +CFL_NUMBER = 0.5 +TIME_FINAL = 0.2 +RECONS_ORDER = 2 +TIME_INTEGRATION_ORDER = 2 +% +% ---- Boundary Conditions ---- +% +BC_NEUMANN = ( void ) +% +% ---- Quadrature ---- +% +QUAD_TYPE = 
GAUSS_LEGENDRE_TENSORIZED_2D
+QUAD_ORDER = 6
+%
+% ----- Output ----
+%
+VOLUME_OUTPUT = (MINIMAL)
+VOLUME_OUTPUT_FREQUENCY = 0
+SCREEN_OUTPUT = (ITER, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT)
+SCREEN_OUTPUT_FREQUENCY = 1
+HISTORY_OUTPUT = (ITER, SIM_TIME, WALL_TIME, MASS, RMS_FLUX, VTK_OUTPUT, CSV_OUTPUT, CUR_OUTFLOW, TOTAL_OUTFLOW, MAX_OUTFLOW, TOTAL_PARTICLE_ABSORPTION, TOTAL_PARTICLE_ABSORPTION_CENTER, TOTAL_PARTICLE_ABSORPTION_VERTICAL, TOTAL_PARTICLE_ABSORPTION_HORIZONTAL, PROBE_MOMENT_TIME_TRACE, VAR_ABSORPTION_GREEN, ABSORPTION_GREEN_BLOCK, ABSORPTION_GREEN_LINE)
+HISTORY_OUTPUT_FREQUENCY = 1
diff --git a/tests/test_cases.cpp b/tests/test_cases.cpp
index 0a336d8e..383ffa8f 100644
--- a/tests/test_cases.cpp
+++ b/tests/test_cases.cpp
@@ -13,6 +13,9 @@
 #include "common/config.hpp"
 #include "datagenerator/datageneratorbase.hpp"
 #include "solvers/snsolver_hpc.hpp"
+#ifdef KITRT_ENABLE_CUDA_HPC
+#include "solvers/snsolver_hpc_cuda.hpp"
+#endif
 #include "solvers/solverbase.hpp"

 using vtkUnstructuredGridReaderSP = vtkSmartPointer<vtkUnstructuredGridReader>;
@@ -518,6 +521,7 @@ void compareHistoryCSV( const std::string& referenceFile, const std::string& tes
             INFO( "Line " << lineNumber << ", token " << idx_token );
             if( refNumeric && tstNumeric ) {
                 double scale = std::max( 1.0, std::max( std::fabs( valueRef ), std::fabs( valueTest ) ) );
+                INFO( "Column " << headerRef[idx_token] << ", ref=" << valueRef << ", test=" << valueTest );
                 REQUIRE( std::fabs( valueRef - valueTest ) <= absTol + relTol * scale );
             }
             else {
@@ -575,6 +579,69 @@ TEST_CASE( "SN_SOLVER_HPC_CSV_QOI_VALIDATION", "[validation_tests][hpc]" ) {
     }
 }
+#ifdef KITRT_ENABLE_CUDA_HPC
+TEST_CASE( "SN_SOLVER_HPC_CUDA_QOI_VALIDATION", "[validation_tests][hpc][cuda]" ) {
+    std::string hpcFileDir = "input/validation_tests/SN_solver_hpc/";
+    std::string resultDir  = std::string( TESTS_PATH ) + "result";
+    std::string logDir     = std::string( TESTS_PATH ) + "result/logs";
+
+    auto runCpuAndCollectCSV = [&]( const std::string& configFileName, const std::string& prefix ) {
+        Config* config      = new Config( configFileName );
+        SNSolverHPC* solver = new SNSolverHPC( config );
+        solver->Solve();
+        requireAndFlushTestLoggers();
+        delete solver;
+        delete config;
+        return findNewestCSVFile( logDir, prefix );
+    };
+
+    auto runCudaAndCollectCSV = [&]( const std::string& configFileName, const std::string& prefix ) {
+        Config* config          = new Config( configFileName );
+        SNSolverHPCCUDA* solver = new SNSolverHPCCUDA( config );
+        solver->Solve();
+        requireAndFlushTestLoggers();
+        delete solver;
+        delete config;
+        return findNewestCSVFile( logDir, prefix );
+    };
+
+    SECTION( "lattice_order1_cpu_vs_cuda_all_qois" ) {
+        std::filesystem::remove_all( resultDir );
+        resetTestLoggers();
+        const std::string cpuConfig = std::string( TESTS_PATH ) + hpcFileDir + "lattice_hpc_200_cpu_order1.cfg";
+        const std::string cpuCSV    = runCpuAndCollectCSV( cpuConfig, "lattice_hpc_200_cpu_order1_output" );
+        REQUIRE( !cpuCSV.empty() );
+
+        resetTestLoggers();
+        const std::string cudaConfig = std::string( TESTS_PATH ) + hpcFileDir + "lattice_hpc_200_cuda_order1.cfg";
+        const std::string cudaCSV    = runCudaAndCollectCSV( cudaConfig, "lattice_hpc_200_cuda_order1_output" );
+        REQUIRE( !cudaCSV.empty() );
+
+        INFO( "CPU CSV: " << cpuCSV );
+        INFO( "CUDA CSV: " << cudaCSV );
+        compareHistoryCSV( cpuCSV, cudaCSV );
+    }
+
+    SECTION( "lattice_order2_cpu_vs_cuda_all_qois" ) {
+        std::filesystem::remove_all( resultDir );
+        resetTestLoggers();
+        const std::string cpuConfig = std::string( TESTS_PATH ) + hpcFileDir +
"lattice_hpc_200_cpu_order2.cfg"; + const std::string cpuCSV = runCpuAndCollectCSV( cpuConfig, "lattice_hpc_200_cpu_order2_output" ); + REQUIRE( !cpuCSV.empty() ); + + resetTestLoggers(); + const std::string cudaConfig = std::string( TESTS_PATH ) + hpcFileDir + "lattice_hpc_200_cuda_order2.cfg"; + const std::string cudaCSV = runCudaAndCollectCSV( cudaConfig, "lattice_hpc_200_cuda_order2_output" ); + REQUIRE( !cudaCSV.empty() ); + + INFO( "CPU CSV: " << cpuCSV ); + INFO( "CUDA CSV: " << cudaCSV ); + compareHistoryCSV( cpuCSV, cudaCSV ); + } + +} +#endif + TEST_CASE( "screen_output", "[output]" ) { std::string out_fileDir = "input/validation_tests/output/"; resetTestLoggers(); // Make sure to write in own logging file diff --git a/tools/singularity/build_container.sh b/tools/singularity/build_container.sh old mode 100644 new mode 100755 index 699aef11..1975c056 --- a/tools/singularity/build_container.sh +++ b/tools/singularity/build_container.sh @@ -1 +1,21 @@ -singularity build kit_rt.sif kit_rt.def +#!/usr/bin/env bash +set -euo pipefail + +mode="${1:-cpu}" + +case "${mode}" in + cpu) + singularity build kit_rt.sif kit_rt.def + ;; + cuda) + singularity build kit_rt_MPI_cuda.sif kit_rt_MPI_cuda.def + ;; + all) + singularity build kit_rt.sif kit_rt.def + singularity build kit_rt_MPI_cuda.sif kit_rt_MPI_cuda.def + ;; + *) + echo "Usage: $0 [cpu|cuda|all]" >&2 + exit 1 + ;; +esac diff --git a/tools/singularity/install_kitrt_singularity.sh b/tools/singularity/install_kitrt_singularity.sh index 6c8299d1..3489812d 100755 --- a/tools/singularity/install_kitrt_singularity.sh +++ b/tools/singularity/install_kitrt_singularity.sh @@ -1,4 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + cd ../../ +mkdir -p build_singularity cd build_singularity -cmake ../ -make -j \ No newline at end of file +cmake .. +make -j diff --git a/tools/singularity/install_kitrt_singularity_cuda.sh b/tools/singularity/install_kitrt_singularity_cuda.sh new file mode 100755 index 00000000..b88fe4a1 --- /dev/null +++ b/tools/singularity/install_kitrt_singularity_cuda.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env bash +set -euo pipefail + +cd ../../ +mkdir -p build_singularity_cuda +cd build_singularity_cuda +cmake -DCMAKE_BUILD_TYPE=Release -DBUILD_MPI=OFF -DBUILD_CUDA_HPC=ON -DBUILD_ML=OFF .. 
+make -j
diff --git a/tools/singularity/kit_rt_MPI_cuda.def b/tools/singularity/kit_rt_MPI_cuda.def
new file mode 100644
index 00000000..d4490190
--- /dev/null
+++ b/tools/singularity/kit_rt_MPI_cuda.def
@@ -0,0 +1,69 @@
+Bootstrap: docker
+From: nvidia/cuda:12.4.1-devel-ubuntu20.04
+
+%post
+    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/cuda/lib64"
+    export PYTHONPATH=/usr/local/gmsh/lib:$PYTHONPATH
+    export PATH=/usr/local/gmsh/bin:/usr/local/cuda/bin:$PATH
+
+    apt-get update
+    DEBIAN_FRONTEND=noninteractive apt-get install -qq \
+        gcc \
+        g++ \
+        libopenmpi-dev \
+        openmpi-bin \
+        libblas-dev \
+        liblapack-dev \
+        git \
+        make \
+        ninja-build \
+        cmake \
+        wget \
+        ssh \
+        libssl-dev \
+        libxt-dev \
+        libgl1-mesa-dev \
+        libglu1 \
+        libxrender1 \
+        libxcursor-dev \
+        libxft-dev \
+        libxinerama-dev \
+        python3 \
+        python3-pip \
+        doxygen
+
+    apt-get clean
+    apt-get autoremove --purge
+    rm -rf /var/lib/apt/lists/*
+
+    cd /usr/local
+    wget -nc --quiet http://gmsh.info/bin/Linux/gmsh-4.7.0-Linux64-sdk.tgz
+    tar xzf gmsh-4.7.0-Linux64-sdk.tgz
+    mv gmsh-4.7.0-Linux64-sdk gmsh
+    rm gmsh-4.7.0-Linux64-sdk.tgz
+
+    wget -nc --no-check-certificate --quiet https://www.vtk.org/files/release/9.1/VTK-9.1.0.tar.gz
+    tar xzf VTK-9.1.0.tar.gz
+    mkdir VTK-9.1.0/build
+    cd VTK-9.1.0/build
+    cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DBUILD_DOCUMENTATION=OFF -DBUILD_TESTING=OFF ../
+    ninja
+    ninja install > /dev/null
+    cd -
+    rm -rf VTK-*
+
+    FILENAME=libtensorflow-gpu-linux-x86_64-2.7.0.tar.gz
+    wget -q --no-check-certificate https://storage.googleapis.com/tensorflow/libtensorflow/${FILENAME}
+    tar -C /usr/local -xzf ${FILENAME}
+    ldconfig /usr/local/lib
+    rm ${FILENAME}
+
+    pip3 install numpy pygmsh==6.1.1 Pillow pydicom gcovr sphinx_rtd_theme breathe cpp-coveralls coveralls pandas
+
+%environment
+    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib:/usr/local/cuda/lib64"
+    export PYTHONPATH=/usr/local/gmsh/lib:$PYTHONPATH
+    export PATH=/usr/local/gmsh/bin:/usr/local/cuda/bin:$PATH
+
+%runscript
+    exec /bin/bash
diff --git a/tools/singularity/singularity_run_interactive_cuda.sh b/tools/singularity/singularity_run_interactive_cuda.sh
new file mode 100644
index 00000000..668b1f62
--- /dev/null
+++ b/tools/singularity/singularity_run_interactive_cuda.sh
@@ -0,0 +1 @@
+singularity shell --nv --bind "$(pwd)/../..:/mnt" kit_rt_MPI_cuda.sif
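
For reference, a minimal end-to-end sketch of how the tooling in this changeset fits together: build the CUDA container, configure with the new BUILD_CUDA_HPC option via the install script, then run only the CUDA validation tests. The unit_tests binary name, the build_singularity_cuda directory, and the [cuda] Catch2 tag all come from the build files and test case above; the exact invocation paths and the availability of Singularity/Apptainer with GPU passthrough (--nv) are assumptions, not part of the diff.

    # Assumed workflow, starting from tools/singularity/ on a host with an NVIDIA GPU:
    ./build_container.sh cuda                         # builds kit_rt_MPI_cuda.sif
    singularity exec --nv kit_rt_MPI_cuda.sif \
        bash install_kitrt_singularity_cuda.sh        # configures with -DBUILD_CUDA_HPC=ON and compiles
    cd ../../build_singularity_cuda
    singularity exec --nv ../tools/singularity/kit_rt_MPI_cuda.sif \
        ./unit_tests "[cuda]"                         # runs only tests tagged [cuda], i.e. the CPU-vs-CUDA QOI comparison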