From e1ec9ad4d31d7b3af9fc150dd510ee471d5dba5f Mon Sep 17 00:00:00 2001
From: Anand Radhakrishnan <aradhakr34@login03.frontier.olcf.ornl.gov>
Date: Fri, 23 Jan 2026 09:52:59 -0500
Subject: [PATCH 01/21] LLVMFlang workarounds

---
 CMakeLists.txt                                |  44 +-
 load_amd.sh                                   |  39 +-
 src/common/include/shared_parallel_macros.fpp |  10 +
 src/common/m_boundary_common.fpp              | 362 ++++++-----
 src/common/m_chemistry.fpp                    | 415 +++++++------
 src/post_process/m_global_parameters.fpp      |   2 +
 src/pre_process/m_global_parameters.fpp       |   2 +
 src/simulation/include/inline_capillary.fpp   |   9 +-
 src/simulation/m_cbc.fpp                      |  22 +-
 src/simulation/m_data_output.fpp              |  16 +-
 src/simulation/m_global_parameters.fpp        |   2 +
 src/simulation/m_hyperelastic.fpp             |   2 +-
 src/simulation/m_igr.fpp                      |   6 +-
 src/simulation/m_muscl.fpp                    |   7 +-
 src/simulation/m_rhs.fpp                      |  97 +--
 src/simulation/m_riemann_solvers.fpp          | 415 +++++++------
 src/simulation/m_sim_helpers.fpp              |  10 +-
 src/simulation/m_start_up.fpp                 |   9 +-
 src/simulation/m_surface_tension.fpp          |  77 +--
 src/simulation/m_time_steppers.fpp            |   7 +-
 src/simulation/m_viscous.fpp                  | 587 +++++++++---------
 src/simulation/m_weno.fpp                     | 101 +--
 toolchain/mfc/case.py                         |   4 +-
 23 files changed, 1223 insertions(+), 1022 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 751a690b55..0507754dce 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -310,7 +310,18 @@ endif()
 # and generate documentation. Instead, we can simply include the list of .f90
 # files that will eventually be used to compile <target>.
 
-macro(HANDLE_SOURCES target useCommon)
+macro(HANDLE_SOURCES target useCommon useOpenACC useOpenMP)
+
+    if (${useOpenACC} AND ${useOpenMP})
+      message(FATAL_ERROR "OpenACC and OpenMP at same time is unsupported.")
+    elseif (${useOpenACC})
+      set(MFC_GPU_MODE "OpenACC")
+    elseif (${useOpenMP})
+      set(MFC_GPU_MODE "OpenMP")
+    else()
+      set(MFC_GPU_MODE "")
+    endif()
+
     set(${target}_DIR "${CMAKE_SOURCE_DIR}/src/${target}")
     set(common_DIR    "${CMAKE_SOURCE_DIR}/src/common")
 
@@ -371,7 +382,8 @@ macro(HANDLE_SOURCES target useCommon)
                                  -D MFC_${CMAKE_Fortran_COMPILER_ID}
                                  -D MFC_${${target}_UPPER}
                                  -D MFC_COMPILER="${CMAKE_Fortran_COMPILER_ID}"
-                                 -D MFC_CASE_OPTIMIZATION=False
+                                 -D MFC_GPU_MODE="${MFC_GPU_MODE}"
+				 -D MFC_CASE_OPTIMIZATION=False
                                  -D chemistry=False
                                  --line-numbering
                                  --no-folding
@@ -388,11 +400,10 @@ macro(HANDLE_SOURCES target useCommon)
 endmacro()
 
 
-HANDLE_SOURCES(pre_process ON)
-HANDLE_SOURCES(simulation ON)
-HANDLE_SOURCES(post_process ON)
-HANDLE_SOURCES(syscheck OFF)
-
+HANDLE_SOURCES(pre_process ON OFF OFF)
+HANDLE_SOURCES(simulation ON ${MFC_OpenACC} ${MFC_OpenMP})
+HANDLE_SOURCES(post_process ON OFF OFF)
+HANDLE_SOURCES(syscheck OFF ${MFC_OpenACC} ${MFC_OpenMP})
 
 # MFC_SETUP_TARGET: Given a target (herein <target>), this macro creates a new
 # executable <target> with the appropriate sources, compiler definitions, and
@@ -430,8 +441,9 @@ function(MFC_SETUP_TARGET)
 
     foreach (a_target ${IPO_TARGETS})
         set_target_properties(${a_target} PROPERTIES Fortran_PREPROCESS ON)
+	message(STATUS ${CMAKE_Fortran_COMPILER_ID})
 
-        target_include_directories(${a_target} PRIVATE
+	target_include_directories(${a_target} PRIVATE
             "${CMAKE_SOURCE_DIR}/src/common"
             "${CMAKE_SOURCE_DIR}/src/common/include"
             "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}")
@@ -449,9 +461,9 @@ function(MFC_SETUP_TARGET)
 
         if (MFC_MPI AND ARGS_MPI)
             find_package(MPI COMPONENTS Fortran REQUIRED)
-
-            target_compile_definitions(${a_target} PRIVATE MFC_MPI)
-            target_link_libraries     (${a_target} PRIVATE MPI::MPI_Fortran)
+	    
+	    target_compile_definitions(${a_target} PRIVATE $ENV{CRAY_MPICH_INC})
+            target_link_libraries     (${a_target} PRIVATE $ENV{CRAY_MPICH_LIB})
         endif()
 
         if (ARGS_SILO)
@@ -470,9 +482,9 @@ function(MFC_SETUP_TARGET)
                     find_package(CUDAToolkit REQUIRED)
                     target_link_libraries(${a_target} PRIVATE CUDA::cudart CUDA::cufft)
                 else()
-                    find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
-                    target_link_libraries(${a_target} PRIVATE hipfort::hipfft)
-                endif()
+			#find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
+                    target_link_libraries(${a_target} PRIVATE $ENV{CRAY_HIPFORT_INC})
+		endif()
             else()
                 find_package(FFTW REQUIRED)
                 target_link_libraries(${a_target} PRIVATE FFTW::FFTW)
@@ -517,8 +529,8 @@ function(MFC_SETUP_TARGET)
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-                    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
-                    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
+                    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a $ENV{CRAY_MPICH_INC})
+		    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a $ENV{CRAY_MPICH_LIB})
                 endif()
             endif()
 
diff --git a/load_amd.sh b/load_amd.sh
index 550da1687d..7137c18fc4 100644
--- a/load_amd.sh
+++ b/load_amd.sh
@@ -1,7 +1,32 @@
-module use /ccs/home/bcornille/afar-drops/modulefiles/Core/
-module load rocm-afar-drop mpich cray-python
-module load cmake
-#export OMPX_APU_MAPS=0
-#export HSA_XNACK=0
-#export LIBOMPTARGET_INFO=0
-#export AMD_LOG_LEVEL=1
+module load python cmake
+module load cpe/25.09
+module load PrgEnv-amd
+
+AFAR_UMS_BASEDIR="/sw/crusher/ums/compilers/afar"
+AFAR_UMS_LATEST=$(ls -d --color=never ${AFAR_UMS_BASEDIR}/*/ | tail -n1)
+export OLCF_AFAR_ROOT=${AFAR_UMS_BASEDIR}/"rocm-afar-8873-drop-22.2.0"
+
+export PATH=${OLCF_AFAR_ROOT}/lib/llvm/bin:${PATH}
+export LD_LIBRARY_PATH=${OLCF_AFAR_ROOT}/lib:${OLCF_AFAR_ROOT}/lib/llvm/lib:${LD_LIBRARY_PATH}
+
+export CRAY_MPICH_INC="-I${OLCF_AFAR_ROOT}/include/mpich3.4a2"
+export CRAY_HIPFORT_INC="-I${OLCF_AFAR_ROOT}/include/hipfort/amdgcn"
+export CRAY_HIP_INC="-I${OLCF_AFAR_ROOT}/include/hip"
+export CRAY_MPICH_LIB="-L${CRAY_MPICH_PREFIX}/lib \
+                        ${CRAY_PMI_POST_LINK_OPTS} \
+                        -lmpifort_amd -lmpi_amd -lmpi -lpmi -lpmi2"
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CRAY_LD_LIBRARY_PATH}"
+export CMAKE_PREFIX_PATH="${OLCF_AFAR_ROOT}:${CMAKE_PREFIX_PATH}"
+export FC="${OLCF_AFAR_ROOT}/bin/amdflang"
+
+unset MPICH_GPU_SUPPORT_ENABLED
+
+# module use /ccs/home/bcornille/afar-drops/modulefiles/Core/
+# module load rocm-afar-drop mpich cray-python
+# module load cmake
+
+export OMPX_APU_MAPS=0
+export HSA_XNACK=0
+export LIBOMPTARGET_INFO=0
+export AMD_LOG_LEVEL=0
+export OFFLOAD_TRACK_ALLOCATION_TRACES=false
diff --git a/src/common/include/shared_parallel_macros.fpp b/src/common/include/shared_parallel_macros.fpp
index a3a0b6f753..36bee0a23a 100644
--- a/src/common/include/shared_parallel_macros.fpp
+++ b/src/common/include/shared_parallel_macros.fpp
@@ -1,3 +1,13 @@
+#:set NVIDIA_COMPILER_ID="NVHPC"
+#:set PGI_COMPILER_ID="PGI"
+#:set INTEL_COMPILER_ID="Intel"
+#:set CCE_COMPILER_ID="Cray"
+#:set AMD_COMPILER_ID="LLVMFlang"
+
+#:set USING_NVHPC = (MFC_COMPILER == NVIDIA_COMPILER_ID or MFC_COMPILER == PGI_COMPILER_ID)
+#:set USING_CCE = (MFC_COMPILER == CCE_COMPILER_ID)
+#:set USING_AMD = (MFC_COMPILER == AMD_COMPILER_ID)
+
 #:def ASSERT_LIST(data, datatype)
     #:assert data is not None
     #:assert isinstance(data, list)
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 833bcbb908..4fccf3aa68 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -152,68 +152,74 @@ contains
 
         if (n == 0) return
 
-        if (bc_y%beg >= 0) then
-            call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, -1, sys_size, pb_in, mv_in)
-        else
-            $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
-            do l = 0, p
-                do k = -buff_size, m + buff_size
-                    select case (int(bc_type(2, 1)%sf(k, 0, l)))
-                    case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP)
-                        call s_ghost_cell_extrapolation(q_prim_vf, 2, -1, k, l)
-                    case (BC_AXIS)
-                        call s_axis(q_prim_vf, pb_in, mv_in, k, l)
-                    case (BC_REFLECTIVE)
-                        call s_symmetry(q_prim_vf, 2, -1, k, l, pb_in, mv_in)
-                    case (BC_PERIODIC)
-                        call s_periodic(q_prim_vf, 2, -1, k, l, pb_in, mv_in)
-                    case (BC_SLIP_WALL)
-                        call s_slip_wall(q_prim_vf, 2, -1, k, l)
-                    case (BC_NO_SLIP_WALL)
-                        call s_no_slip_wall(q_prim_vf, 2, -1, k, l)
-                    case (BC_DIRICHLET)
-                        call s_dirichlet(q_prim_vf, 2, -1, k, l)
-                    end select
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+        
+            if (bc_y%beg >= 0) then
+                call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, -1, sys_size, pb_in, mv_in)
+            else
+                $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
+                do l = 0, p
+                    do k = -buff_size, m + buff_size
+                        select case (int(bc_type(2, 1)%sf(k, 0, l)))
+                        case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP)
+                            call s_ghost_cell_extrapolation(q_prim_vf, 2, -1, k, l)
+                        case (BC_AXIS)
+                            call s_axis(q_prim_vf, pb_in, mv_in, k, l)
+                        case (BC_REFLECTIVE)
+                            call s_symmetry(q_prim_vf, 2, -1, k, l, pb_in, mv_in)
+                        case (BC_PERIODIC)
+                            call s_periodic(q_prim_vf, 2, -1, k, l, pb_in, mv_in)
+                        case (BC_SLIP_WALL)
+                            call s_slip_wall(q_prim_vf, 2, -1, k, l)
+                        case (BC_NO_SLIP_WALL)
+                            call s_no_slip_wall(q_prim_vf, 2, -1, k, l)
+                        case (BC_DIRICHLET)
+                            call s_dirichlet(q_prim_vf, 2, -1, k, l)
+                        end select
 
-                    if (qbmm .and. (.not. polytropic) .and. &
-                        (bc_type(2, 1)%sf(k, 0, l) <= BC_GHOST_EXTRAP) .and. &
-                        (bc_type(2, 1)%sf(k, 0, l) /= BC_AXIS)) then
-                        call s_qbmm_extrapolation(2, -1, k, l, pb_in, mv_in)
-                    end if
+                        if (qbmm .and. (.not. polytropic) .and. &
+                            (bc_type(2, 1)%sf(k, 0, l) <= BC_GHOST_EXTRAP) .and. &
+                            (bc_type(2, 1)%sf(k, 0, l) /= BC_AXIS)) then
+                            call s_qbmm_extrapolation(2, -1, k, l, pb_in, mv_in)
+                        end if
+                    end do
                 end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
+                $:END_GPU_PARALLEL_LOOP()
+            end if
 
-        if (bc_y%end >= 0) then
-            call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, 1, sys_size, pb_in, mv_in)
-        else
-            $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
-            do l = 0, p
-                do k = -buff_size, m + buff_size
-                    select case (int(bc_type(2, 2)%sf(k, 0, l)))
-                    case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP)
-                        call s_ghost_cell_extrapolation(q_prim_vf, 2, 1, k, l)
-                    case (BC_REFLECTIVE)
-                        call s_symmetry(q_prim_vf, 2, 1, k, l, pb_in, mv_in)
-                    case (BC_PERIODIC)
-                        call s_periodic(q_prim_vf, 2, 1, k, l, pb_in, mv_in)
-                    case (BC_SLIP_WALL)
-                        call s_slip_wall(q_prim_vf, 2, 1, k, l)
-                    case (BC_NO_SLIP_WALL)
-                        call s_no_slip_wall(q_prim_vf, 2, 1, k, l)
-                    case (BC_DIRICHLET)
-                        call s_dirichlet(q_prim_vf, 2, 1, k, l)
-                    end select
+         
 
-                    if (qbmm .and. (.not. polytropic) .and. &
-                        (bc_type(2, 2)%sf(k, 0, l) <= BC_GHOST_EXTRAP)) then
-                        call s_qbmm_extrapolation(2, 1, k, l, pb_in, mv_in)
-                    end if
+            if (bc_y%end >= 0) then
+                call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, 1, sys_size, pb_in, mv_in)
+            else
+                $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
+                do l = 0, p
+                    do k = -buff_size, m + buff_size
+                        select case (int(bc_type(2, 2)%sf(k, 0, l)))
+                        case (BC_CHAR_SUP_OUTFLOW:BC_GHOST_EXTRAP)
+                            call s_ghost_cell_extrapolation(q_prim_vf, 2, 1, k, l)
+                        case (BC_REFLECTIVE)
+                            call s_symmetry(q_prim_vf, 2, 1, k, l, pb_in, mv_in)
+                        case (BC_PERIODIC)
+                            call s_periodic(q_prim_vf, 2, 1, k, l, pb_in, mv_in)
+                        case (BC_SLIP_WALL)
+                            call s_slip_wall(q_prim_vf, 2, 1, k, l)
+                        case (BC_NO_SLIP_WALL)
+                            call s_no_slip_wall(q_prim_vf, 2, 1, k, l)
+                        case (BC_DIRICHLET)
+                            call s_dirichlet(q_prim_vf, 2, 1, k, l)
+                        end select
+
+                        if (qbmm .and. (.not. polytropic) .and. &
+                            (bc_type(2, 2)%sf(k, 0, l) <= BC_GHOST_EXTRAP)) then
+                            call s_qbmm_extrapolation(2, 1, k, l, pb_in, mv_in)
+                        end if
+                    end do
                 end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+        #:endif
 
         ! Population of Buffers in z-direction
 
@@ -1208,44 +1214,48 @@ contains
 
         if (n == 0) return
 
-        !< y-direction
-        if (bc_y%beg >= 0) then
-            call s_mpi_sendrecv_variables_buffers(c_divs, 2, -1, num_dims + 1)
-        else
-            $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
-            do l = 0, p
-                do k = -buff_size, m + buff_size
-                    select case (bc_type(2, 1)%sf(k, 0, l))
-                    case (BC_PERIODIC)
-                        call s_color_function_periodic(c_divs, 2, -1, k, l)
-                    case (BC_REFLECTIVE)
-                        call s_color_function_reflective(c_divs, 2, -1, k, l)
-                    case default
-                        call s_color_function_ghost_cell_extrapolation(c_divs, 2, -1, k, l)
-                    end select
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+
+            !< y-direction
+            if (bc_y%beg >= 0) then
+                call s_mpi_sendrecv_variables_buffers(c_divs, 2, -1, num_dims + 1)
+            else
+                $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
+                do l = 0, p
+                    do k = -buff_size, m + buff_size
+                        select case (bc_type(2, 1)%sf(k, 0, l))
+                        case (BC_PERIODIC)
+                            call s_color_function_periodic(c_divs, 2, -1, k, l)
+                        case (BC_REFLECTIVE)
+                            call s_color_function_reflective(c_divs, 2, -1, k, l)
+                        case default
+                            call s_color_function_ghost_cell_extrapolation(c_divs, 2, -1, k, l)
+                        end select
+                    end do
                 end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
+                $:END_GPU_PARALLEL_LOOP()
+            end if
 
-        if (bc_y%end >= 0) then
-            call s_mpi_sendrecv_variables_buffers(c_divs, 2, 1, num_dims + 1)
-        else
-            $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
-            do l = 0, p
-                do k = -buff_size, m + buff_size
-                    select case (bc_type(2, 2)%sf(k, 0, l))
-                    case (BC_PERIODIC)
-                        call s_color_function_periodic(c_divs, 2, 1, k, l)
-                    case (BC_REFLECTIVE)
-                        call s_color_function_reflective(c_divs, 2, 1, k, l)
-                    case default
-                        call s_color_function_ghost_cell_extrapolation(c_divs, 2, 1, k, l)
-                    end select
+            if (bc_y%end >= 0) then
+                call s_mpi_sendrecv_variables_buffers(c_divs, 2, 1, num_dims + 1)
+            else
+                $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
+                do l = 0, p
+                    do k = -buff_size, m + buff_size
+                        select case (bc_type(2, 2)%sf(k, 0, l))
+                        case (BC_PERIODIC)
+                            call s_color_function_periodic(c_divs, 2, 1, k, l)
+                        case (BC_REFLECTIVE)
+                            call s_color_function_reflective(c_divs, 2, 1, k, l)
+                        case default
+                            call s_color_function_ghost_cell_extrapolation(c_divs, 2, 1, k, l)
+                        end select
+                    end do
                 end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+        #:endif
 
         if (p == 0) return
 
@@ -1539,58 +1549,62 @@ contains
 
         end if
 
-        if (n == 0) then
-            return
-        else if (bc_y%beg >= 0) then
-            call s_mpi_sendrecv_variables_buffers(jac_sf, 2, -1, 1)
-        else
-            $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
-            do l = 0, p
-                do k = idwbuff(1)%beg, idwbuff(1)%end
-                    select case (bc_type(2, 1)%sf(k, 0, l))
-                    case (BC_PERIODIC)
-                        do j = 1, buff_size
-                            jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, n - j + 1, l)
-                        end do
-                    case (BC_REFLECTIVE)
-                        do j = 1, buff_size
-                            jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, j - 1, l)
-                        end do
-                    case default
-                        do j = 1, buff_size
-                            jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, 0, l)
-                        end do
-                    end select
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+
+            if (n == 0) then
+                return
+            else if (bc_y%beg >= 0) then
+                call s_mpi_sendrecv_variables_buffers(jac_sf, 2, -1, 1)
+            else
+                $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
+                do l = 0, p
+                    do k = idwbuff(1)%beg, idwbuff(1)%end
+                        select case (bc_type(2, 1)%sf(k, 0, l))
+                        case (BC_PERIODIC)
+                            do j = 1, buff_size
+                                jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, n - j + 1, l)
+                            end do
+                        case (BC_REFLECTIVE)
+                            do j = 1, buff_size
+                                jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, j - 1, l)
+                            end do
+                        case default
+                            do j = 1, buff_size
+                                jac_sf(1)%sf(k, -j, l) = jac_sf(1)%sf(k, 0, l)
+                            end do
+                        end select
+                    end do
                 end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
+                $:END_GPU_PARALLEL_LOOP()
 
-        end if
+            end if
 
-        if (bc_y%end >= 0) then
-            call s_mpi_sendrecv_variables_buffers(jac_sf, 2, 1, 1)
-        else
-            $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
-            do l = 0, p
-                do k = idwbuff(1)%beg, idwbuff(1)%end
-                    select case (bc_type(2, 2)%sf(k, 0, l))
-                    case (BC_PERIODIC)
-                        do j = 1, buff_size
-                            jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, j - 1, l)
-                        end do
-                    case (BC_REFLECTIVE)
-                        do j = 1, buff_size
-                            jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, n - (j - 1), l)
-                        end do
-                    case default
-                        do j = 1, buff_size
-                            jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, n, l)
-                        end do
-                    end select
+            if (bc_y%end >= 0) then
+                call s_mpi_sendrecv_variables_buffers(jac_sf, 2, 1, 1)
+            else
+                $:GPU_PARALLEL_LOOP(private='[l,k]', collapse=2)
+                do l = 0, p
+                    do k = idwbuff(1)%beg, idwbuff(1)%end
+                        select case (bc_type(2, 2)%sf(k, 0, l))
+                        case (BC_PERIODIC)
+                            do j = 1, buff_size
+                                jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, j - 1, l)
+                            end do
+                        case (BC_REFLECTIVE)
+                            do j = 1, buff_size
+                                jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, n - (j - 1), l)
+                            end do
+                        case default
+                            do j = 1, buff_size
+                                jac_sf(1)%sf(k, n + j, l) = jac_sf(1)%sf(k, n, l)
+                            end do
+                        end select
+                    end do
                 end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+
+        #:endif
 
         #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
             if (p == 0) then
@@ -1913,27 +1927,35 @@ contains
             end do
         end do
 
-        if (n > 0) then
-            do k = 0, p
-                do j = 1, sys_size
-                    do i = 0, m
-                        bc_buffers(2, 1)%sf(i, j, k) = q_prim_vf(j)%sf(i, 0, k)
-                        bc_buffers(2, 2)%sf(i, j, k) = q_prim_vf(j)%sf(i, n, k)
-                    end do
-                end do
-            end do
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
 
-            if (p > 0) then
-                do k = 1, sys_size
-                    do j = 0, n
+            if (n > 0) then
+                do k = 0, p
+                    do j = 1, sys_size
                         do i = 0, m
-                            bc_buffers(3, 1)%sf(i, j, k) = q_prim_vf(k)%sf(i, j, 0)
-                            bc_buffers(3, 2)%sf(i, j, k) = q_prim_vf(k)%sf(i, j, p)
+                            bc_buffers(2, 1)%sf(i, j, k) = q_prim_vf(j)%sf(i, 0, k)
+                            bc_buffers(2, 2)%sf(i, j, k) = q_prim_vf(j)%sf(i, n, k)
                         end do
                     end do
                 end do
+
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+
+                    if (p > 0) then
+                        do k = 1, sys_size
+                            do j = 0, n
+                                do i = 0, m
+                                    bc_buffers(3, 1)%sf(i, j, k) = q_prim_vf(k)%sf(i, j, 0)
+                                    bc_buffers(3, 2)%sf(i, j, k) = q_prim_vf(k)%sf(i, j, p)
+                                end do
+                            end do
+                        end do
+                    end if
+
+                #:endif
             end if
-        end if
+
+        #:endif
 
     end subroutine s_pack_boundary_condition_buffers
 
@@ -1945,18 +1967,20 @@ contains
         bc_type(1, 2)%sf(:, :, :) = int(min(bc_x%end, 0), kind=1)
         $:GPU_UPDATE(device='[bc_type(1,1)%sf,bc_type(1,2)%sf]')
 
-        if (n > 0) then
-            bc_type(2, 1)%sf(:, :, :) = int(min(bc_y%beg, 0), kind=1)
-            bc_type(2, 2)%sf(:, :, :) = int(min(bc_y%end, 0), kind=1)
-            $:GPU_UPDATE(device='[bc_type(2,1)%sf,bc_type(2,2)%sf]')
-            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                if (p > 0) then
-                    bc_type(3, 1)%sf(:, :, :) = int(min(bc_z%beg, 0), kind=1)
-                    bc_type(3, 2)%sf(:, :, :) = int(min(bc_z%end, 0), kind=1)
-                    $:GPU_UPDATE(device='[bc_type(3,1)%sf,bc_type(3,2)%sf]')
-                end if
-            #:endif
-        end if
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+            if (n > 0) then
+                bc_type(2, 1)%sf(:, :, :) = int(min(bc_y%beg, 0), kind=1)
+                bc_type(2, 2)%sf(:, :, :) = int(min(bc_y%end, 0), kind=1)
+                $:GPU_UPDATE(device='[bc_type(2,1)%sf,bc_type(2,2)%sf]')
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                    if (p > 0) then
+                        bc_type(3, 1)%sf(:, :, :) = int(min(bc_z%beg, 0), kind=1)
+                        bc_type(3, 2)%sf(:, :, :) = int(min(bc_z%end, 0), kind=1)
+                        $:GPU_UPDATE(device='[bc_type(3,1)%sf,bc_type(3,2)%sf]')
+                    end if
+                #:endif
+            end if
+        #:endif
 
     end subroutine s_assign_default_bc_type
 
@@ -2155,14 +2179,18 @@ contains
         if (bc_io) then
             deallocate (bc_buffers(1, 1)%sf)
             deallocate (bc_buffers(1, 2)%sf)
+            #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
             if (n > 0) then
                 deallocate (bc_buffers(2, 1)%sf)
                 deallocate (bc_buffers(2, 2)%sf)
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                 if (p > 0) then
                     deallocate (bc_buffers(3, 1)%sf)
                     deallocate (bc_buffers(3, 2)%sf)
                 end if
+                #:endif
             end if
+            #:endif
         end if
 
         deallocate (bc_buffers)
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index 8339766813..5d3d1b7811 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -19,12 +19,12 @@ module m_chemistry
 
     implicit none
 
-    #:block DEF_AMD
+    #:if USING_AMD
         real(wp) :: molecular_weights_nonparameter(10) = &
                     (/2.016, 1.008, 15.999, 31.998, 17.007, 18.015, 33.006, &
                       34.014, 39.95, 28.014/)
         $:GPU_DECLARE(create='[molecular_weights_nonparameter]')
-    #:endblock DEF_AMD
+    #:endif
 
     type(int_bounds_info) :: isc1, isc2, isc3
     $:GPU_DECLARE(create='[isc1, isc2, isc3]')
@@ -129,7 +129,7 @@ contains
         real(wp), dimension(num_species) :: Ys
         real(wp), dimension(num_species) :: omega
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega, omega_m]', copyin='[bounds]')
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega_m]', copyin='[bounds]')
         do z = bounds(3)%beg, bounds(3)%end
             do y = bounds(2)%beg, bounds(2)%end
                 do x = bounds(1)%beg, bounds(1)%end
@@ -146,12 +146,11 @@ contains
 
                     $:GPU_LOOP(parallelism='[seq]')
                     do eqn = chemxb, chemxe
-                        #:block UNDEF_AMD
-                            omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1)
-                        #:endblock UNDEF_AMD
-                        #:block DEF_AMD
+                        #:if USING_AMD
                             omega_m = molecular_weights_nonparameter(eqn - chemxb + 1)*omega(eqn - chemxb + 1)
-                        #:endblock DEF_AMD
+                        #:else
+                            omega_m = molecular_weights(eqn - chemxb + 1)*omega(eqn - chemxb + 1)
+                        #:endif
                         rhs_vf(eqn)%sf(x, y, z) = rhs_vf(eqn)%sf(x, y, z) + omega_m
 
                     end do
@@ -189,218 +188,224 @@ contains
 
         $:GPU_UPDATE(device='[isc1,isc2,isc3]')
 
-        if (chemistry) then
+        if (chemistry .or. dummy) then
 
             ! Set offsets based on direction using array indexing
             offsets = 0
             offsets(idir) = 1
             ! Model 1: Mixture-Average Transport
             if (chem_params%transport_model == 1) then
-                #:block UNDEF_AMD
-                    ! Note: Added 'i' and 'eqn' to private list.
-                    $:GPU_PARALLEL_LOOP(collapse=3,  private='[x,y,z,i,eqn,Ys_L, Ys_R, Ys_cell, Xs_L, Xs_R, mass_diffusivities_mixavg1, mass_diffusivities_mixavg2, mass_diffusivities_mixavg_Cell, h_l, h_r, Xs_cell, h_k, dXk_dxi,Mass_Diffu_Flux, Mass_Diffu_Energy, MW_L, MW_R, MW_cell, Rgas_L, Rgas_R, T_L, T_R, P_L, P_R, rho_L, rho_R, rho_cell, rho_Vic, lambda_L, lambda_R, lambda_Cell, dT_dxi, grid_spacing]', copyin='[offsets]')
-                    do z = isc3%beg, isc3%end
-                        do y = isc2%beg, isc2%end
-                            do x = isc1%beg, isc1%end
-                                ! Calculate grid spacing using direction-based indexing
-                                select case (idir)
-                                case (1)
-                                    grid_spacing = x_cc(x + 1) - x_cc(x)
-                                case (2)
-                                    grid_spacing = y_cc(y + 1) - y_cc(y)
-                                case (3)
-                                    grid_spacing = z_cc(z + 1) - z_cc(z)
-                                end select
-
-                                ! Extract species mass fractions
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    Ys_L(i - chemxb + 1) = q_prim_qp(i)%sf(x, y, z)
-                                    Ys_R(i - chemxb + 1) = q_prim_qp(i)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
-                                    Ys_cell(i - chemxb + 1) = 0.5_wp*(Ys_L(i - chemxb + 1) + Ys_R(i - chemxb + 1))
-                                end do
-
-                                ! Calculate molecular weights and mole fractions
-                                call get_mixture_molecular_weight(Ys_L, MW_L)
-                                call get_mixture_molecular_weight(Ys_R, MW_R)
-                                MW_cell = 0.5_wp*(MW_L + MW_R)
-
-                                call get_mole_fractions(MW_L, Ys_L, Xs_L)
-                                call get_mole_fractions(MW_R, Ys_R, Xs_R)
-
-                                ! Calculate gas constants and thermodynamic properties
-                                Rgas_L = gas_constant/MW_L
-                                Rgas_R = gas_constant/MW_R
-
-                                P_L = q_prim_qp(E_idx)%sf(x, y, z)
-                                P_R = q_prim_qp(E_idx)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
-
-                                rho_L = q_prim_qp(1)%sf(x, y, z)
-                                rho_R = q_prim_qp(1)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
-
-                                T_L = P_L/rho_L/Rgas_L
-                                T_R = P_R/rho_R/Rgas_R
-
-                                rho_cell = 0.5_wp*(rho_L + rho_R)
-                                dT_dxi = (T_R - T_L)/grid_spacing
-
-                                ! Get transport properties
-                                call get_species_mass_diffusivities_mixavg(P_L, T_L, Ys_L, mass_diffusivities_mixavg1)
-                                call get_species_mass_diffusivities_mixavg(P_R, T_R, Ys_R, mass_diffusivities_mixavg2)
-
-                                call get_mixture_thermal_conductivity_mixavg(T_L, Ys_L, lambda_L)
-                                call get_mixture_thermal_conductivity_mixavg(T_R, Ys_R, lambda_R)
-
-                                call get_species_enthalpies_rt(T_L, h_l)
-                                call get_species_enthalpies_rt(T_R, h_r)
-
-                                ! Calculate species properties and gradients
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights(i - chemxb + 1)
-                                    h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights(i - chemxb + 1)
-                                    Xs_cell(i - chemxb + 1) = 0.5_wp*(Xs_L(i - chemxb + 1) + Xs_R(i - chemxb + 1))
-                                    h_k(i - chemxb + 1) = 0.5_wp*(h_l(i - chemxb + 1) + h_r(i - chemxb + 1))
-                                    dXk_dxi(i - chemxb + 1) = (Xs_R(i - chemxb + 1) - Xs_L(i - chemxb + 1))/grid_spacing
-                                end do
-
-                                ! Calculate mixture-averaged diffusivities
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    mass_diffusivities_mixavg_Cell(i - chemxb + 1) = &
-                                        (mass_diffusivities_mixavg2(i - chemxb + 1) + mass_diffusivities_mixavg1(i - chemxb + 1))/2.0_wp
-                                end do
-
-                                lambda_Cell = 0.5_wp*(lambda_R + lambda_L)
-
-                                ! Calculate mass diffusion fluxes
-                                rho_Vic = 0.0_wp
-                                Mass_Diffu_Energy = 0.0_wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do eqn = chemxb, chemxe
-                                    Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* &
-                                                                        molecular_weights(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1)
-                                    rho_Vic = rho_Vic + Mass_Diffu_Flux(eqn - chemxb + 1)
-                                    Mass_Diffu_Energy = Mass_Diffu_Energy + h_k(eqn - chemxb + 1)*Mass_Diffu_Flux(eqn - chemxb + 1)
-                                end do
-
-                                ! Apply corrections for mass conservation
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do eqn = chemxb, chemxe
-                                    Mass_Diffu_Energy = Mass_Diffu_Energy - h_k(eqn - chemxb + 1)*Ys_cell(eqn - chemxb + 1)*rho_Vic
-                                    Mass_Diffu_Flux(eqn - chemxb + 1) = Mass_Diffu_Flux(eqn - chemxb + 1) - rho_Vic*Ys_cell(eqn - chemxb + 1)
-                                end do
-
-                                ! Add thermal conduction contribution
-                                Mass_Diffu_Energy = lambda_Cell*dT_dxi + Mass_Diffu_Energy
-
-                                ! Update flux arrays
-                                flux_src_vf(E_idx)%sf(x, y, z) = flux_src_vf(E_idx)%sf(x, y, z) - Mass_Diffu_Energy
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do eqn = chemxb, chemxe
-                                    flux_src_vf(eqn)%sf(x, y, z) = flux_src_vf(eqn)%sf(x, y, z) - Mass_Diffu_Flux(eqn - chemxb + 1)
-                                end do
+                ! Note: Added 'i' and 'eqn' to private list.
+                $:GPU_PARALLEL_LOOP(collapse=3,  private='[x,y,z,i,eqn,Ys_L, Ys_R, Ys_cell, Xs_L, Xs_R, mass_diffusivities_mixavg1, mass_diffusivities_mixavg2, mass_diffusivities_mixavg_Cell, h_l, h_r, Xs_cell, h_k, dXk_dxi,Mass_Diffu_Flux, Mass_Diffu_Energy, MW_L, MW_R, MW_cell, Rgas_L, Rgas_R, T_L, T_R, P_L, P_R, rho_L, rho_R, rho_cell, rho_Vic, lambda_L, lambda_R, lambda_Cell, dT_dxi, grid_spacing]', copyin='[offsets]')
+                do z = isc3%beg, isc3%end
+                    do y = isc2%beg, isc2%end
+                        do x = isc1%beg, isc1%end
+                            ! Calculate grid spacing using direction-based indexing
+                            select case (idir)
+                            case (1)
+                                grid_spacing = x_cc(x + 1) - x_cc(x)
+                            case (2)
+                                grid_spacing = y_cc(y + 1) - y_cc(y)
+                            case (3)
+                                grid_spacing = z_cc(z + 1) - z_cc(z)
+                            end select
+
+                            ! Extract species mass fractions
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = chemxb, chemxe
+                                Ys_L(i - chemxb + 1) = q_prim_qp(i)%sf(x, y, z)
+                                Ys_R(i - chemxb + 1) = q_prim_qp(i)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
+                                Ys_cell(i - chemxb + 1) = 0.5_wp*(Ys_L(i - chemxb + 1) + Ys_R(i - chemxb + 1))
+                            end do
+
+                            ! Calculate molecular weights and mole fractions
+                            call get_mixture_molecular_weight(Ys_L, MW_L)
+                            call get_mixture_molecular_weight(Ys_R, MW_R)
+                            MW_cell = 0.5_wp*(MW_L + MW_R)
+
+                            call get_mole_fractions(MW_L, Ys_L, Xs_L)
+                            call get_mole_fractions(MW_R, Ys_R, Xs_R)
+
+                            ! Calculate gas constants and thermodynamic properties
+                            Rgas_L = gas_constant/MW_L
+                            Rgas_R = gas_constant/MW_R
+
+                            P_L = q_prim_qp(E_idx)%sf(x, y, z)
+                            P_R = q_prim_qp(E_idx)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
+
+                            rho_L = q_prim_qp(1)%sf(x, y, z)
+                            rho_R = q_prim_qp(1)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
+
+                            T_L = P_L/rho_L/Rgas_L
+                            T_R = P_R/rho_R/Rgas_R
+
+                            rho_cell = 0.5_wp*(rho_L + rho_R)
+                            dT_dxi = (T_R - T_L)/grid_spacing
+
+                            ! Get transport properties
+                            call get_species_mass_diffusivities_mixavg(P_L, T_L, Ys_L, mass_diffusivities_mixavg1)
+                            call get_species_mass_diffusivities_mixavg(P_R, T_R, Ys_R, mass_diffusivities_mixavg2)
+
+                            call get_mixture_thermal_conductivity_mixavg(T_L, Ys_L, lambda_L)
+                            call get_mixture_thermal_conductivity_mixavg(T_R, Ys_R, lambda_R)
+
+                            call get_species_enthalpies_rt(T_L, h_l)
+                            call get_species_enthalpies_rt(T_R, h_r)
+
+                            ! Calculate species properties and gradients
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = chemxb, chemxe
+                                #:if USING_AMD
+                                h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights_nonparameter(i - chemxb + 1)
+                                h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights_nonparameter(i - chemxb + 1)
+                                #:else
+                                h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights(i - chemxb + 1)
+                                h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights(i - chemxb + 1)
+                                #:endif
+                                Xs_cell(i - chemxb + 1) = 0.5_wp*(Xs_L(i - chemxb + 1) + Xs_R(i - chemxb + 1))
+                                h_k(i - chemxb + 1) = 0.5_wp*(h_l(i - chemxb + 1) + h_r(i - chemxb + 1))
+                                dXk_dxi(i - chemxb + 1) = (Xs_R(i - chemxb + 1) - Xs_L(i - chemxb + 1))/grid_spacing
+                            end do
+
+                            ! Calculate mixture-averaged diffusivities
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = chemxb, chemxe
+                                mass_diffusivities_mixavg_Cell(i - chemxb + 1) = &
+                                    (mass_diffusivities_mixavg2(i - chemxb + 1) + mass_diffusivities_mixavg1(i - chemxb + 1))/2.0_wp
+                            end do
+
+                            lambda_Cell = 0.5_wp*(lambda_R + lambda_L)
+
+                            ! Calculate mass diffusion fluxes
+                            rho_Vic = 0.0_wp
+                            Mass_Diffu_Energy = 0.0_wp
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do eqn = chemxb, chemxe
+                                #:if USING_AMD
+                                Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* &
+                                                                    molecular_weights_nonparameter(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1)
+                                #:else
+                                Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* &
+                                                                    molecular_weights(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1)
+                                #:endif
+                                rho_Vic = rho_Vic + Mass_Diffu_Flux(eqn - chemxb + 1)
+                                Mass_Diffu_Energy = Mass_Diffu_Energy + h_k(eqn - chemxb + 1)*Mass_Diffu_Flux(eqn - chemxb + 1)
+                            end do
+
+                            ! Apply corrections for mass conservation
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do eqn = chemxb, chemxe
+                                Mass_Diffu_Energy = Mass_Diffu_Energy - h_k(eqn - chemxb + 1)*Ys_cell(eqn - chemxb + 1)*rho_Vic
+                                Mass_Diffu_Flux(eqn - chemxb + 1) = Mass_Diffu_Flux(eqn - chemxb + 1) - rho_Vic*Ys_cell(eqn - chemxb + 1)
+                            end do
+
+                            ! Add thermal conduction contribution
+                            Mass_Diffu_Energy = lambda_Cell*dT_dxi + Mass_Diffu_Energy
+
+                            ! Update flux arrays
+                            flux_src_vf(E_idx)%sf(x, y, z) = flux_src_vf(E_idx)%sf(x, y, z) - Mass_Diffu_Energy
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do eqn = chemxb, chemxe
+                                flux_src_vf(eqn)%sf(x, y, z) = flux_src_vf(eqn)%sf(x, y, z) - Mass_Diffu_Flux(eqn - chemxb + 1)
                             end do
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:endblock UNDEF_AMD
+                end do
+                $:END_GPU_PARALLEL_LOOP()
 
                 ! Model 2: Unity Lewis Number
             else if (chem_params%transport_model == 2) then
-                #:block UNDEF_AMD
-                    ! Note: Added ALL scalars and 'i'/'eqn' to private list to prevent race conditions.
-                    $:GPU_PARALLEL_LOOP(collapse=3, private='[x,y,z,i,eqn,Ys_L, Ys_R, Ys_cell, dYk_dxi, Mass_Diffu_Flux, grid_spacing, MW_L, MW_R, MW_cell, Rgas_L, Rgas_R, P_L, P_R, rho_L, rho_R, rho_cell, T_L, T_R, Cp_L, Cp_R, hmix_L, hmix_R, dh_dxi, lambda_L, lambda_R, lambda_Cell, diffusivity_L, diffusivity_R, diffusivity_cell, Mass_Diffu_Energy]', copyin='[offsets]')
-                    do z = isc3%beg, isc3%end
-                        do y = isc2%beg, isc2%end
-                            do x = isc1%beg, isc1%end
-                                ! Calculate grid spacing using direction-based indexing
-                                select case (idir)
-                                case (1)
-                                    grid_spacing = x_cc(x + 1) - x_cc(x)
-                                case (2)
-                                    grid_spacing = y_cc(y + 1) - y_cc(y)
-                                case (3)
-                                    grid_spacing = z_cc(z + 1) - z_cc(z)
-                                end select
-
-                                ! Extract species mass fractions
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    Ys_L(i - chemxb + 1) = q_prim_qp(i)%sf(x, y, z)
-                                    Ys_R(i - chemxb + 1) = q_prim_qp(i)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
-                                    Ys_cell(i - chemxb + 1) = 0.5_wp*(Ys_L(i - chemxb + 1) + Ys_R(i - chemxb + 1))
-                                end do
-
-                                ! Calculate molecular weights and mole fractions
-                                call get_mixture_molecular_weight(Ys_L, MW_L)
-                                call get_mixture_molecular_weight(Ys_R, MW_R)
-                                MW_cell = 0.5_wp*(MW_L + MW_R)
-
-                                ! Calculate gas constants and thermodynamic properties
-                                Rgas_L = gas_constant/MW_L
-                                Rgas_R = gas_constant/MW_R
-
-                                P_L = q_prim_qp(E_idx)%sf(x, y, z)
-                                P_R = q_prim_qp(E_idx)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
-
-                                rho_L = q_prim_qp(1)%sf(x, y, z)
-                                rho_R = q_prim_qp(1)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
-
-                                T_L = P_L/rho_L/Rgas_L
-                                T_R = P_R/rho_R/Rgas_R
-
-                                rho_cell = 0.5_wp*(rho_L + rho_R)
-
-                                call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L)
-                                call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R)
-                                call get_mixture_enthalpy_mass(T_L, Ys_L, hmix_L)
-                                call get_mixture_enthalpy_mass(T_R, Ys_R, hmix_R)
-                                dh_dxi = (hmix_R - hmix_L)/grid_spacing
-
-                                ! Get transport properties
-                                call get_mixture_thermal_conductivity_mixavg(T_L, Ys_L, lambda_L)
-                                call get_mixture_thermal_conductivity_mixavg(T_R, Ys_R, lambda_R)
-
-                                ! Calculate species properties and gradients
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = chemxb, chemxe
-                                    dYk_dxi(i - chemxb + 1) = (Ys_R(i - chemxb + 1) - &
-                                                               Ys_L(i - chemxb + 1))/grid_spacing
-                                end do
-
-                                ! Calculate mixture-averaged diffusivities
-                                diffusivity_L = lambda_L/rho_L/Cp_L
-                                diffusivity_R = lambda_R/rho_R/Cp_R
-
-                                lambda_Cell = 0.5_wp*(lambda_R + lambda_L)
-                                diffusivity_cell = 0.5_wp*(diffusivity_R + diffusivity_L)
-
-                                ! Calculate mass diffusion fluxes
-                                Mass_Diffu_Energy = 0.0_wp
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do eqn = chemxb, chemxe
-                                    Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell* &
-                                                                        diffusivity_cell* &
-                                                                        dYk_dxi(eqn - chemxb + 1)
-                                end do
-                                Mass_Diffu_Energy = rho_cell*diffusivity_cell*dh_dxi
-
-                                ! Update flux arrays
-                                flux_src_vf(E_idx)%sf(x, y, z) = flux_src_vf(E_idx)%sf(x, y, z) - Mass_Diffu_Energy
-
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do eqn = chemxb, chemxe
-                                    flux_src_vf(eqn)%sf(x, y, z) = flux_src_vf(eqn)%sf(x, y, z) - Mass_Diffu_Flux(eqn - chemxb + 1)
-                                end do
+                ! Note: Added ALL scalars and 'i'/'eqn' to private list to prevent race conditions.
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[x,y,z,i,eqn,Ys_L, Ys_R, Ys_cell, dYk_dxi, Mass_Diffu_Flux, grid_spacing, MW_L, MW_R, MW_cell, Rgas_L, Rgas_R, P_L, P_R, rho_L, rho_R, rho_cell, T_L, T_R, Cp_L, Cp_R, hmix_L, hmix_R, dh_dxi, lambda_L, lambda_R, lambda_Cell, diffusivity_L, diffusivity_R, diffusivity_cell, Mass_Diffu_Energy]', copyin='[offsets]')
+                do z = isc3%beg, isc3%end
+                    do y = isc2%beg, isc2%end
+                        do x = isc1%beg, isc1%end
+                            ! Calculate grid spacing using direction-based indexing
+                            select case (idir)
+                            case (1)
+                                grid_spacing = x_cc(x + 1) - x_cc(x)
+                            case (2)
+                                grid_spacing = y_cc(y + 1) - y_cc(y)
+                            case (3)
+                                grid_spacing = z_cc(z + 1) - z_cc(z)
+                            end select
+
+                            ! Extract species mass fractions
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = chemxb, chemxe
+                                Ys_L(i - chemxb + 1) = q_prim_qp(i)%sf(x, y, z)
+                                Ys_R(i - chemxb + 1) = q_prim_qp(i)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
+                                Ys_cell(i - chemxb + 1) = 0.5_wp*(Ys_L(i - chemxb + 1) + Ys_R(i - chemxb + 1))
+                            end do
+
+                            ! Calculate molecular weights and mole fractions
+                            call get_mixture_molecular_weight(Ys_L, MW_L)
+                            call get_mixture_molecular_weight(Ys_R, MW_R)
+                            MW_cell = 0.5_wp*(MW_L + MW_R)
+
+                            ! Calculate gas constants and thermodynamic properties
+                            Rgas_L = gas_constant/MW_L
+                            Rgas_R = gas_constant/MW_R
+
+                            P_L = q_prim_qp(E_idx)%sf(x, y, z)
+                            P_R = q_prim_qp(E_idx)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
+
+                            rho_L = q_prim_qp(1)%sf(x, y, z)
+                            rho_R = q_prim_qp(1)%sf(x + offsets(1), y + offsets(2), z + offsets(3))
+
+                            T_L = P_L/rho_L/Rgas_L
+                            T_R = P_R/rho_R/Rgas_R
+
+                            rho_cell = 0.5_wp*(rho_L + rho_R)
+
+                            call get_mixture_specific_heat_cp_mass(T_L, Ys_L, Cp_L)
+                            call get_mixture_specific_heat_cp_mass(T_R, Ys_R, Cp_R)
+                            call get_mixture_enthalpy_mass(T_L, Ys_L, hmix_L)
+                            call get_mixture_enthalpy_mass(T_R, Ys_R, hmix_R)
+                            dh_dxi = (hmix_R - hmix_L)/grid_spacing
+
+                            ! Get transport properties
+                            call get_mixture_thermal_conductivity_mixavg(T_L, Ys_L, lambda_L)
+                            call get_mixture_thermal_conductivity_mixavg(T_R, Ys_R, lambda_R)
+
+                            ! Calculate species properties and gradients
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = chemxb, chemxe
+                                dYk_dxi(i - chemxb + 1) = (Ys_R(i - chemxb + 1) - &
+                                                           Ys_L(i - chemxb + 1))/grid_spacing
+                            end do
+
+                            ! Calculate mixture-averaged diffusivities
+                            diffusivity_L = lambda_L/rho_L/Cp_L
+                            diffusivity_R = lambda_R/rho_R/Cp_R
+
+                            lambda_Cell = 0.5_wp*(lambda_R + lambda_L)
+                            diffusivity_cell = 0.5_wp*(diffusivity_R + diffusivity_L)
+
+                            ! Calculate mass diffusion fluxes
+                            Mass_Diffu_Energy = 0.0_wp
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do eqn = chemxb, chemxe
+                                Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell* &
+                                                                    diffusivity_cell* &
+                                                                    dYk_dxi(eqn - chemxb + 1)
+                            end do
+                            Mass_Diffu_Energy = rho_cell*diffusivity_cell*dh_dxi
+
+                            ! Update flux arrays
+                            flux_src_vf(E_idx)%sf(x, y, z) = flux_src_vf(E_idx)%sf(x, y, z) - Mass_Diffu_Energy
+
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do eqn = chemxb, chemxe
+                                flux_src_vf(eqn)%sf(x, y, z) = flux_src_vf(eqn)%sf(x, y, z) - Mass_Diffu_Flux(eqn - chemxb + 1)
                             end do
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:endblock UNDEF_AMD
+                end do
+                $:END_GPU_PARALLEL_LOOP()
             end if
         end if
 
diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index 39e5c7800c..e4585d011a 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -245,6 +245,7 @@ module m_global_parameters
     logical :: E_wrt
     logical, dimension(num_fluids_max) :: alpha_rho_e_wrt
     logical :: fft_wrt
+    logical :: dummy
     logical :: pres_wrt
     logical, dimension(num_fluids_max) :: alpha_wrt
     logical :: gamma_wrt
@@ -475,6 +476,7 @@ contains
         file_per_process = .false.
         E_wrt = .false.
         fft_wrt = .false.
+        dummy = .false.
         pres_wrt = .false.
         alpha_wrt = .false.
         gamma_wrt = .false.
diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp
index ea679ba6f5..8380f5bb02 100644
--- a/src/pre_process/m_global_parameters.fpp
+++ b/src/pre_process/m_global_parameters.fpp
@@ -300,6 +300,7 @@ module m_global_parameters
     !! to the next time-step.
 
     logical :: fft_wrt
+    logical :: dummy 
 
 contains
 
@@ -409,6 +410,7 @@ contains
         elliptic_smoothing = .false.
 
         fft_wrt = .false.
+        dummy = .false.
 
         simplex_perturb = .false.
         simplex_params%perturb_vel(:) = .false.
diff --git a/src/simulation/include/inline_capillary.fpp b/src/simulation/include/inline_capillary.fpp
index ccd451f7c6..89e1aabaec 100644
--- a/src/simulation/include/inline_capillary.fpp
+++ b/src/simulation/include/inline_capillary.fpp
@@ -1,11 +1,12 @@
 #:def compute_capillary_stress_tensor()
 
     Omega(1, 1) = -sigma*(w2*w2 + w3*w3)/normW
+    #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+        Omega(2, 1) = sigma*w1*w2/normW
+        Omega(1, 2) = Omega(2, 1)
 
-    Omega(2, 1) = sigma*w1*w2/normW
-    Omega(1, 2) = Omega(2, 1)
-
-    Omega(2, 2) = -sigma*(w1*w1 + w3*w3)/normW
+        Omega(2, 2) = -sigma*(w1*w1 + w3*w3)/normW
+    #:endif
 
     if (p > 0) then
         #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index b5027b9fd1..0950072042 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -37,9 +37,9 @@ module m_cbc
         molecular_weights, get_species_specific_heats_r, &
         get_mole_fractions, get_species_specific_heats_r
 
-    #:block DEF_AMD
+    #:if USING_AMD
         use m_chemistry, only: molecular_weights_nonparameter
-    #:endblock DEF_AMD
+    #:endif
     implicit none
 
     private; public :: s_initialize_cbc_module, s_cbc, s_finalize_cbc_module
@@ -700,7 +700,7 @@ contains
             if (cbc_dir == ${CBC_DIR}$ .and. recon_type == WENO_TYPE) then
 
                 ! PI2 of flux_rs_vf and flux_src_rs_vf at j = 1/2
-                if (weno_order == 3) then
+                if (weno_order == 3 .or. dummy) then
 
                     call s_convert_primitive_to_flux_variables(q_prim_rs${XYZ}$_vf, &
                                                                F_rs${XYZ}$_vf, &
@@ -732,9 +732,10 @@ contains
                         end do
                     end do
                     $:END_GPU_PARALLEL_LOOP()
+                end if
 
                     ! PI4 of flux_rs_vf and flux_src_rs_vf at j = 1/2, 3/2
-                else
+                if(weno_order == 5 .or. dummy) then
                     call s_convert_primitive_to_flux_variables(q_prim_rs${XYZ}$_vf, &
                                                                F_rs${XYZ}$_vf, &
                                                                F_src_rs${XYZ}$_vf, &
@@ -1059,15 +1060,14 @@ contains
                             sum_Enthalpies = 0._wp
                             $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_species
-                                #:block UNDEF_AMD
-                                    h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T
-                                    sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i)
-                                #:endblock UNDEF_AMD
-
-                                #:block DEF_AMD
+                                
+                                #:if USING_AMD
                                     h_k(i) = h_k(i)*gas_constant/molecular_weights_nonparameter(i)*T
                                     sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights_nonparameter(i)*Cp/R_gas)*dYs_dt(i)
-                                #:endblock DEF_AMD
+                                #:else
+                                    h_k(i) = h_k(i)*gas_constant/molecular_weights(i)*T
+                                    sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights(i)*Cp/R_gas)*dYs_dt(i)
+                                #:endif
                             end do
                             flux_rs${XYZ}$_vf_l(-1, k, r, E_idx) = flux_rs${XYZ}$_vf_l(0, k, r, E_idx) &
                                                                    + ds(0)*((E/rho + pres/rho)*drho_dt + rho*vel_dv_dt_sum + Cp*T*L(2)/(c*c) + sum_Enthalpies)
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index d2fee92764..d239532f43 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -2,6 +2,7 @@
 !! @brief Contains module m_data_output
 
 #:include 'macros.fpp'
+#:include 'case.fpp'
 
 !> @brief The primary purpose of this module is to output the grid and the
 !!              conservative variables data at the chosen time-step interval. In
@@ -267,10 +268,15 @@ contains
         integer, intent(in) :: t_step
 
         real(wp) :: rho        !< Cell-avg. density
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(2) :: alpha      !< Cell-avg. volume fraction
+        real(wp), dimension(3) :: vel        !< Cell-avg. velocity
+#:else
+        real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
         real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
+#:endif
         real(wp) :: vel_sum    !< Cell-avg. velocity sum
         real(wp) :: pres       !< Cell-avg. pressure
-        real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
         real(wp) :: gamma      !< Cell-avg. sp. heat ratio
         real(wp) :: pi_inf     !< Cell-avg. liquid stiffness function
         real(wp) :: qv         !< Cell-avg. internal energy reference value
@@ -320,7 +326,7 @@ contains
         #:call GPU_PARALLEL(copyout='[icfl_max_loc]', copyin='[icfl_sf]')
             icfl_max_loc = maxval(icfl_sf)
         #:endcall GPU_PARALLEL
-        if (viscous) then
+        if (viscous .or. dummy) then
             #:call GPU_PARALLEL(copyout='[vcfl_max_loc, Rc_min_loc]', copyin='[vcfl_sf,Rc_sf]')
                 vcfl_max_loc = maxval(vcfl_sf)
                 Rc_min_loc = minval(Rc_sf)
@@ -1567,6 +1573,7 @@ contains
                     end if
                 elseif (p == 0) then
                     if (bubbles_euler) then
+                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                         write (i + 30, '(6X,10F24.8)') &
                             nondim_time, &
                             rho, &
@@ -1578,7 +1585,9 @@ contains
                             nRdot(1), &
                             R(1), &
                             Rdot(1)
+                        #:endif
                     else if (elasticity) then
+                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                         write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
                                'F24.8,F24.8,F24.8)') &
                             nondim_time, &
@@ -1589,6 +1598,7 @@ contains
                             tau_e(1), &
                             tau_e(2), &
                             tau_e(3)
+                        #:endif
                     else
                         write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8)') &
                             nondim_time, &
@@ -1598,6 +1608,7 @@ contains
                         print *, 'time =', nondim_time, 'rho =', rho, 'pres =', pres
                     end if
                 else
+                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                     write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
                            'F24.8,F24.8,F24.8,F24.8,F24.8,'// &
                            'F24.8)') &
@@ -1612,6 +1623,7 @@ contains
                         qv, &
                         c, &
                         accel
+                    #:endif
                 end if
             end if
         end do
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 50967956ae..122133f844 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -543,6 +543,7 @@ module m_global_parameters
     $:GPU_DECLARE(create='[Bx0,powell]')
 
     logical :: fft_wrt
+    logical :: dummy
 
     !> @name Continuum damage model parameters
     !> @{!
@@ -761,6 +762,7 @@ contains
         #:endfor
 
         fft_wrt = .false.
+        dummy = .false.
 
         do j = 1, num_probes_max
             acoustic(j)%pulse = dflt_int
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 21625069b2..76e0c74841 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -106,7 +106,7 @@ contains
         real(wp) :: G_local
         integer :: j, k, l, i, r
 
-        $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,alpha_K, alpha_rho_K, rho, gamma, pi_inf, qv, G_local, Re, tensora, tensorb, i]')
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,alpha_K, alpha_rho_K, rho, gamma, pi_inf, qv, G_local, Re, tensora, tensorb]')
         do l = 0, p
             do k = 0, n
                 do j = 0, m
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 60555b554b..e99c9a6401 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -356,7 +356,7 @@ contains
 
             call s_populate_F_igr_buffers(bc_type, jac_sf)
 
-            if (igr_iter_solver == 1) then ! Jacobi iteration
+            if (igr_iter_solver == 1 .or. dummy) then ! Jacobi iteration
                 $:GPU_PARALLEL_LOOP(private='[j,k,l]', collapse=3)
                 do l = idwbuff(3)%beg, idwbuff(3)%end
                     do k = idwbuff(2)%beg, idwbuff(2)%end
@@ -487,6 +487,7 @@ contains
 
         if (idir == 1) then
             if (p == 0) then
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                 $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]')
                 do l = 0, p
                     do k = 0, n
@@ -901,6 +902,7 @@ contains
                     end do
                 end do
                 $:END_GPU_PARALLEL_LOOP()
+                #:endif
             else
                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                     $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]')
@@ -1418,6 +1420,7 @@ contains
             end if
         else if (idir == 2) then
             if (p == 0) then
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                 $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]')
                 do l = 0, p
                     do k = -1, n
@@ -1812,6 +1815,7 @@ contains
                     end do
                 end do
                 $:END_GPU_PARALLEL_LOOP()
+                #:endif
             else
                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                     $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]')
diff --git a/src/simulation/m_muscl.fpp b/src/simulation/m_muscl.fpp
index c9d5b4bff0..91e32a7c6a 100644
--- a/src/simulation/m_muscl.fpp
+++ b/src/simulation/m_muscl.fpp
@@ -112,11 +112,11 @@ contains
 
         $:GPU_UPDATE(device='[is1_muscl,is2_muscl,is3_muscl]')
 
-        if (muscl_order /= 1) then
+        if (muscl_order /= 1 .or. dummy) then
             call s_initialize_muscl(v_vf, muscl_dir)
         end if
 
-        if (muscl_order == 1) then
+        if (muscl_order == 1 .or. dummy) then
             if (muscl_dir == 1) then
                 $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
@@ -157,8 +157,9 @@ contains
                 end do
                 $:END_GPU_PARALLEL_LOOP()
             end if
+        end if
 
-        else if (muscl_order == 2) then
+        if (muscl_order == 2 .or. dummy) then
             ! MUSCL Reconstruction
             #:for MUSCL_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (muscl_dir == ${MUSCL_DIR}$) then
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 2882a26616..1c8eda1c30 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -659,7 +659,7 @@ contains
 
         call cpu_time(t_start)
 
-        if (.not. igr) then
+        if (.not. igr .or. dummy) then
             ! Association/Population of Working Variables
             $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
             do i = 1, sys_size
@@ -697,11 +697,12 @@ contains
             end if
         end if
 
-        if (igr) then
+        if (igr .or. dummy) then
             call nvtxStartRange("RHS-COMMUNICATION")
             call s_populate_variables_buffers(bc_type, q_cons_vf, pb_in, mv_in)
             call nvtxEndRange
-        else
+        end if
+        if(.not. igr .or. dummy) then
             call nvtxStartRange("RHS-CONVERT")
             call s_convert_conservative_to_primitive_variables( &
                 q_cons_qp%vf, &
@@ -727,7 +728,7 @@ contains
 
         if (qbmm) call s_mom_inv(q_cons_qp%vf, q_prim_qp%vf, mom_sp, mom_3d, pb_in, rhs_pb, mv_in, rhs_mv, idwbuff(1), idwbuff(2), idwbuff(3))
 
-        if (viscous .and. .not. igr) then
+        if ((viscous .and. .not. igr) .or. dummy) then
             call nvtxStartRange("RHS-VISCOUS")
             call s_get_viscous(qL_rsx_vf, qL_rsy_vf, qL_rsz_vf, &
                                dqL_prim_dx_n, dqL_prim_dy_n, dqL_prim_dz_n, &
@@ -750,7 +751,7 @@ contains
         ! Dimensional Splitting Loop
         do id = 1, num_dims
 
-            if (igr) then
+            if (igr .or. dummy) then
 
                 if (id == 1) then
                     $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
@@ -779,8 +780,8 @@ contains
                     call s_igr_sigma_x(q_cons_vf, rhs_vf)
                     call nvtxEndRange
                 end if
-
-            else ! Finite volume solve
+            end if 
+            if(.not. igr .or. dummy) then! Finite volume solve
 
                 ! Reconstructing Primitive/Conservative Variables
                 call nvtxStartRange("RHS-WENO")
@@ -1055,7 +1056,7 @@ contains
         ! END: Additional pphysics and source terms
 
         if (run_time_info .or. probe_wrt .or. ib .or. bubbles_lagrange) then
-            if (.not. igr) then
+            if (.not. igr .or. dummy) then
                 $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
                 do i = 1, sys_size
                     do l = idwbuff(3)%beg, idwbuff(3)%end
@@ -1632,7 +1633,7 @@ contains
             end if
 
             if (cyl_coord .and. ((bc_y%beg == -2) .or. (bc_y%beg == -14))) then
-                if (viscous) then
+                if (viscous .or. dummy) then
                     if (p > 0) then
                         call s_compute_viscous_stress_cylindrical_boundary(q_prim_vf, &
                                                                            dq_prim_dx_vf(mom_idx%beg:mom_idx%end), &
@@ -1741,7 +1742,7 @@ contains
                     end do
                     $:END_GPU_PARALLEL_LOOP()
 
-                    if (viscous) then
+                    if (viscous .or. dummy) then
                         $:GPU_PARALLEL_LOOP(private='[i,j,l]', collapse=2)
                         do l = 0, p
                             do j = 0, m
@@ -1875,7 +1876,7 @@ contains
         integer :: i, j, k, l
 
         #:for SCHEME, TYPE in [('weno','WENO_TYPE'), ('muscl','MUSCL_TYPE')]
-            if (recon_type == ${TYPE}$) then
+            if (recon_type == ${TYPE}$ .or. dummy) then
                 ! Reconstruction in s1-direction
                 if (norm_dir == 1) then
                     is1 = idwbuff(1); is2 = idwbuff(2); is3 = idwbuff(3)
@@ -1930,7 +1931,7 @@ contains
         ! Reconstruction in s1-direction
 
         #:for SCHEME, TYPE in [('weno','WENO_TYPE'), ('muscl', 'MUSCL_TYPE')]
-            if (recon_type == ${TYPE}$) then
+            if (recon_type == ${TYPE}$ .or. dummy) then
                 if (norm_dir == 1) then
                     is1 = idwbuff(1); is2 = idwbuff(2); is3 = idwbuff(3)
                     recon_dir = 1; is1%beg = is1%beg + ${SCHEME}$_polyn
@@ -1949,49 +1950,49 @@ contains
                 end if
 
                 $:GPU_UPDATE(device='[is1,is2,is3,iv]')
+            end if
+        #:endfor
 
-                if (recon_dir == 1) then
-                    $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3%beg, is3%end
-                            do k = is2%beg, is2%end
-                                do j = is1%beg, is1%end
-                                    vL_x(j, k, l, i) = v_vf(i)%sf(j, k, l)
-                                    vR_x(j, k, l, i) = v_vf(i)%sf(j, k, l)
-                                end do
-                            end do
+         if (recon_dir == 1) then
+            $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
+            do i = iv%beg, iv%end
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            vL_x(j, k, l, i) = v_vf(i)%sf(j, k, l)
+                            vR_x(j, k, l, i) = v_vf(i)%sf(j, k, l)
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                else if (recon_dir == 2) then
-                    $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3%beg, is3%end
-                            do k = is2%beg, is2%end
-                                do j = is1%beg, is1%end
-                                    vL_y(j, k, l, i) = v_vf(i)%sf(k, j, l)
-                                    vR_y(j, k, l, i) = v_vf(i)%sf(k, j, l)
-                                end do
-                            end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        else if (recon_dir == 2) then
+            $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
+            do i = iv%beg, iv%end
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            vL_y(j, k, l, i) = v_vf(i)%sf(k, j, l)
+                            vR_y(j, k, l, i) = v_vf(i)%sf(k, j, l)
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                else if (recon_dir == 3) then
-                    $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3%beg, is3%end
-                            do k = is2%beg, is2%end
-                                do j = is1%beg, is1%end
-                                    vL_z(j, k, l, i) = v_vf(i)%sf(l, k, j)
-                                    vR_z(j, k, l, i) = v_vf(i)%sf(l, k, j)
-                                end do
-                            end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        else if (recon_dir == 3) then
+            $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
+            do i = iv%beg, iv%end
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            vL_z(j, k, l, i) = v_vf(i)%sf(l, k, j)
+                            vR_z(j, k, l, i) = v_vf(i)%sf(l, k, j)
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-            end if
-        #:endfor
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end if
 
     end subroutine s_reconstruct_cell_boundary_values_first_order
 
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index 7e1f67d2d4..fc5cefadaf 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -987,7 +987,7 @@ contains
 
         #:endfor
 
-        if (viscous) then
+        if (viscous .or. dummy) then
             if (weno_Re_flux) then
 
                 call s_compute_viscous_source_flux( &
@@ -1288,6 +1288,7 @@ contains
                                 H_L = (E_L + pres_L)/rho_L
                                 H_R = (E_R + pres_R)/rho_R
                             elseif (mhd .and. relativity) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                 Ga%L = 1._wp/sqrt(1._wp - vel_L_rms)
                                 Ga%R = 1._wp/sqrt(1._wp - vel_R_rms)
                                 vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3)
@@ -1310,6 +1311,7 @@ contains
 
                                 E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L
                                 E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R
+                                #:endif
                             elseif (mhd .and. .not. relativity) then
                                 pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp)
                                 pres_mag%R = 0.5_wp*(B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp)
@@ -1496,11 +1498,13 @@ contains
                             ! Energy
                             if (mhd .and. (.not. relativity)) then
                                 ! energy flux = (E + p + p_mag) * v_${XYZ}$ - B_${XYZ}$ * (v_x*B_x + v_y*B_y + v_z*B_z)
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
                                     (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) &
                                      - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) &
                                      + s_M*s_P*(E_L - E_R)) &
                                     /(s_M - s_P)
+                                #:endif
                             elseif (mhd .and. relativity) then
                                 ! energy flux = m_${XYZ}$ - mass flux
                                 ! Hard-coded for single-component for now
@@ -1648,7 +1652,7 @@ contains
 
         #:endfor
 
-        if (viscous) then
+        if (viscous .or. dummy) then
             $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l, idx_right_phys, vel_grad_L, vel_grad_R, alpha_L, alpha_R, vel_L, vel_R, Re_L, Re_R]', copyin='[norm_dir]')
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
@@ -1721,25 +1725,31 @@ contains
                             do i = 1, num_dims
                                 vel_grad_L(i, 1) = (dqL_prim_dx_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
                                 vel_grad_R(i, 1) = (dqR_prim_dx_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 if (num_dims > 1) then
                                     vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
                                     vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
                                 end if
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                 if (num_dims > 2) then
                                     vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
                                     vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
                                 end if
+                                #:endif
+                                #:endif
                             end do
 
                             if (norm_dir == 1) then
                                 flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(1) + vel_grad_R(1, 1)*vel_R(1))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 if (num_dims > 1) then
                                     flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
                                     flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
 
                                     flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
                                     flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(2) + vel_grad_R(1, 2)*vel_R(2)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(2) + vel_grad_R(2, 1)*vel_R(2))
+                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                     if (num_dims > 2) then
                                         flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
                                         flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
@@ -1747,9 +1757,12 @@ contains
                                         flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
                                         flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(3) + vel_grad_R(1, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(3) + vel_grad_R(3, 1)*vel_R(3))
                                     end if
+                                    #:endif
                                 end if
+                                #:endif
 
                             else if (norm_dir == 2) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
 
@@ -1758,6 +1771,7 @@ contains
 
                                 flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(1) + vel_grad_R(1, 2)*vel_R(1)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(1) + vel_grad_R(2, 1)*vel_R(1))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                 if (num_dims > 2) then
                                     flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
                                     flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
@@ -1765,7 +1779,10 @@ contains
                                     flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
                                     flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(3) + vel_grad_R(2, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(3) + vel_grad_R(3, 2)*vel_R(3))
                                 end if
+                                #:endif 
+                                #:endif
                             else
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                 flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
 
@@ -1780,7 +1797,7 @@ contains
 
                                 flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(2) + vel_grad_R(2, 3)*vel_R(2)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(2) + vel_grad_R(3, 2)*vel_R(2))
-
+                                #:endif
                             end if
                         end if
 
@@ -1790,41 +1807,54 @@ contains
                             do i = 1, num_dims
                                 vel_grad_L(i, 1) = (dqL_prim_dx_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
                                 vel_grad_R(i, 1) = (dqR_prim_dx_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 if (num_dims > 1) then
                                     vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
                                     vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
                                 end if
+                                #:endif
+                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                 if (num_dims > 2) then
                                     vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
                                     vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
                                 end if
+                                #:endif
                             end do
 
                             if (norm_dir == 1) then
                                 flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(1) + vel_grad_R(1, 1)*vel_R(1))
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 if (num_dims > 1) then
                                     flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
                                     flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
 
+                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                     if (num_dims > 2) then
                                         flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
                                         flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
                                     end if
+                                    #:endif
                                 end if
+                                #:endif
 
                             else if (norm_dir == 2) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
 
                                 flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
 
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                 if (num_dims > 2) then
                                     flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
                                     flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
                                 end if
+                                #:endif 
+                                #:endif
                             else
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                                 flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
 
@@ -1833,7 +1863,7 @@ contains
 
                                 flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
-
+                                #:endif
                             end if
 
                         end if
@@ -1912,13 +1942,20 @@ contains
         integer, intent(in) :: norm_dir
         type(int_bounds_info), intent(in) :: ix, iy, iz
 
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
+        real(wp), dimension(2) :: alpha_rho_L, alpha_rho_R
+        real(wp), dimension(2) :: alpha_L, alpha_R
+        real(wp), dimension(3) :: vel_L, vel_R
+#:else
         real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-        real(wp) :: rho_L, rho_R
+        real(wp), dimension(num_fluids) :: alpha_L, alpha_R
         real(wp), dimension(num_dims) :: vel_L, vel_R
+#:endif
+        
+        real(wp) :: rho_L, rho_R        
         real(wp) :: pres_L, pres_R
         real(wp) :: E_L, E_R
         real(wp) :: H_L, H_R
-        real(wp), dimension(num_fluids) :: alpha_L, alpha_R
         real(wp), dimension(num_species) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
         real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
         real(wp) :: Cp_avg, Cv_avg, T_avg, c_sum_Yi_Phi, eps
@@ -1945,11 +1982,17 @@ contains
         real(wp) :: xi_L, xi_R !< Left and right wave speeds functions
         real(wp) :: xi_M, xi_P
         real(wp) :: xi_MP, xi_PP
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
+        real(wp), dimension(3) :: R0_L, R0_R
+        real(wp), dimension(3) :: V0_L, V0_R
+        real(wp), dimension(3) :: P0_L, P0_R
+        real(wp), dimension(3) :: pbw_L, pbw_R
+#:else
         real(wp), dimension(nb) :: R0_L, R0_R
         real(wp), dimension(nb) :: V0_L, V0_R
         real(wp), dimension(nb) :: P0_L, P0_R
         real(wp), dimension(nb) :: pbw_L, pbw_R
+#:endif
 
         real(wp) :: alpha_L_sum, alpha_R_sum, nbub_L, nbub_R
         real(wp) :: ptilde_L, ptilde_R
@@ -1959,7 +2002,11 @@ contains
         real(wp) :: R3V2Lbar, R3V2Rbar
 
         real(wp), dimension(6) :: tau_e_L, tau_e_R
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: xi_field_L, xi_field_R
+#:else
         real(wp), dimension(num_dims) :: xi_field_L, xi_field_R
+#:endif
         real(wp) :: G_L, G_R
 
         real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
@@ -3564,7 +3611,7 @@ contains
         #:endfor
         ! Computing HLLC flux and source flux for Euler system of equations
 
-        if (viscous) then
+        if (viscous .or. dummy) then
             if (weno_Re_flux) then
                 call s_compute_viscous_source_flux( &
                     qL_prim_vf(momxb:momxe), &
@@ -3671,181 +3718,183 @@ contains
 
         #:for NORM_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
             if (norm_dir == ${NORM_DIR}$) then
-                #:block UNDEF_AMD
-                    $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres,E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld, s_L, s_R, s_M, s_starL, s_starR, pTot_L, pTot_R, p_star, rhoL_star, rhoR_star, E_starL, E_starR, sqrt_rhoL_star, sqrt_rhoR_star, denom_ds, sign_Bx, vL_star, vR_star, wL_star, wR_star, v_double, w_double, By_double, Bz_double, E_doubleL, E_doubleR, E_double]', copyin='[norm_dir]')
-                    do l = is3%beg, is3%end
-                        do k = is2%beg, is2%end
-                            do j = is1%beg, is1%end
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_rho_L, alpha_rho_R, vel, alpha_L, alpha_R, rho, pres,E, H_no_mag, gamma, pi_inf, qv, vel_rms, B, c, c_fast, pres_mag, U_L, U_R, U_starL, U_starR, U_doubleL, U_doubleR, F_L, F_R, F_starL, F_starR, F_hlld, s_L, s_R, s_M, s_starL, s_starR, pTot_L, pTot_R, p_star, rhoL_star, rhoR_star, E_starL, E_starR, sqrt_rhoL_star, sqrt_rhoR_star, denom_ds, sign_Bx, vL_star, vR_star, wL_star, wR_star, v_double, w_double, By_double, Bz_double, E_doubleL, E_doubleR, E_double]', copyin='[norm_dir]')
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
 
-                                ! (1) Extract the left/right primitive states
-                                do i = 1, contxe
-                                    alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
-                                    alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
-                                end do
+                            ! (1) Extract the left/right primitive states
+                            do i = 1, contxe
+                                alpha_rho_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, i)
+                                alpha_rho_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, i)
+                            end do
 
-                                ! NOTE: unlike HLL & HLLC, vel_L here is permutated by dir_idx for simpler logic
-                                do i = 1, num_vels
-                                    vel%L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i))
-                                    vel%R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + dir_idx(i))
-                                end do
+                            ! NOTE: unlike HLL & HLLC, vel_L here is permutated by dir_idx for simpler logic
+                            do i = 1, num_vels
+                                vel%L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(i))
+                                vel%R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, contxe + dir_idx(i))
+                            end do
 
-                                vel_rms%L = sum(vel%L**2._wp)
-                                vel_rms%R = sum(vel%R**2._wp)
+                            vel_rms%L = sum(vel%L**2._wp)
+                            vel_rms%R = sum(vel%R**2._wp)
 
-                                do i = 1, num_fluids
-                                    alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
-                                    alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
-                                end do
+                            do i = 1, num_fluids
+                                alpha_L(i) = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx + i)
+                                alpha_R(i) = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx + i)
+                            end do
 
-                                pres%L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
-                                pres%R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
+                            pres%L = qL_prim_rs${XYZ}$_vf(j, k, l, E_idx)
+                            pres%R = qR_prim_rs${XYZ}$_vf(j + 1, k, l, E_idx)
 
-                                ! NOTE: unlike HLL, Bx, By, Bz are permutated by dir_idx for simpler logic
-                                if (mhd) then
-                                    if (n == 0) then ! 1D: constant Bx; By, Bz as variables; only in x so not permutated
-                                        B%L = [Bx0, qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg), qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)]
-                                        B%R = [Bx0, qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg), qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)]
-                                    else ! 2D/3D: Bx, By, Bz as variables
-                                        B%L = [qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(1) - 1), &
-                                               qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1), &
-                                               qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1)]
-                                        B%R = [qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(1) - 1), &
-                                               qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(2) - 1), &
-                                               qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(3) - 1)]
-                                    end if
+                            ! NOTE: unlike HLL, Bx, By, Bz are permutated by dir_idx for simpler logic
+                            if (mhd) then
+                                if (n == 0) then ! 1D: constant Bx; By, Bz as variables; only in x so not permutated
+                                    B%L = [Bx0, qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg), qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1)]
+                                    B%R = [Bx0, qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg), qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + 1)]
+                                else ! 2D/3D: Bx, By, Bz as variables
+                                    B%L = [qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(1) - 1), &
+                                           qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1), &
+                                           qL_prim_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1)]
+                                    B%R = [qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(1) - 1), &
+                                           qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(2) - 1), &
+                                           qR_prim_rs${XYZ}$_vf(j + 1, k, l, B_idx%beg + dir_idx(3) - 1)]
                                 end if
+                            end if
 
-                                ! Sum properties of all fluid components
-                                rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp
-                                rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    rho%L = rho%L + alpha_rho_L(i)
-                                    gamma%L = gamma%L + alpha_L(i)*gammas(i)
-                                    pi_inf%L = pi_inf%L + alpha_L(i)*pi_infs(i)
-                                    qv%L = qv%L + alpha_rho_L(i)*qvs(i)
-
-                                    rho%R = rho%R + alpha_rho_R(i)
-                                    gamma%R = gamma%R + alpha_R(i)*gammas(i)
-                                    pi_inf%R = pi_inf%R + alpha_R(i)*pi_infs(i)
-                                    qv%R = qv%R + alpha_rho_R(i)*qvs(i)
-                                end do
-
-                                pres_mag%L = 0.5_wp*sum(B%L**2._wp)
-                                pres_mag%R = 0.5_wp*sum(B%R**2._wp)
-                                E%L = gamma%L*pres%L + pi_inf%L + 0.5_wp*rho%L*vel_rms%L + qv%L + pres_mag%L
-                                E%R = gamma%R*pres%R + pi_inf%R + 0.5_wp*rho%R*vel_rms%R + qv%R + pres_mag%R ! includes magnetic energy
-                                H_no_mag%L = (E%L + pres%L - pres_mag%L)/rho%L
-                                H_no_mag%R = (E%R + pres%R - pres_mag%R)/rho%R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
-
-                                ! (2) Compute fast wave speeds
-                                call s_compute_speed_of_sound(pres%L, rho%L, gamma%L, pi_inf%L, H_no_mag%L, alpha_L, vel_rms%L, 0._wp, c%L, qv%L)
-                                call s_compute_speed_of_sound(pres%R, rho%R, gamma%R, pi_inf%R, H_no_mag%R, alpha_R, vel_rms%R, 0._wp, c%R, qv%R)
-                                call s_compute_fast_magnetosonic_speed(rho%L, c%L, B%L, norm_dir, c_fast%L, H_no_mag%L)
-                                call s_compute_fast_magnetosonic_speed(rho%R, c%R, B%R, norm_dir, c_fast%R, H_no_mag%R)
-
-                                ! (3) Compute contact speed s_M [Miyoshi Equ. (38)]
-                                s_L = min(vel%L(1) - c_fast%L, vel%R(1) - c_fast%R)
-                                s_R = max(vel%R(1) + c_fast%R, vel%L(1) + c_fast%L)
-
-                                pTot_L = pres%L + pres_mag%L
-                                pTot_R = pres%R + pres_mag%R
-
-                                s_M = (((s_R - vel%R(1))*rho%R*vel%R(1) - &
-                                        (s_L - vel%L(1))*rho%L*vel%L(1) - pTot_R + pTot_L)/ &
-                                       ((s_R - vel%R(1))*rho%R - (s_L - vel%L(1))*rho%L))
-
-                                ! (4) Compute star state variables
-                                rhoL_star = rho%L*(s_L - vel%L(1))/(s_L - s_M)
-                                rhoR_star = rho%R*(s_R - vel%R(1))/(s_R - s_M)
-                                p_star = pTot_L + rho%L*(s_L - vel%L(1))*(s_M - vel%L(1))/(s_L - s_M)
-                                E_starL = ((s_L - vel%L(1))*E%L - pTot_L*vel%L(1) + p_star*s_M)/(s_L - s_M)
-                                E_starR = ((s_R - vel%R(1))*E%R - pTot_R*vel%R(1) + p_star*s_M)/(s_R - s_M)
-
-                                ! (5) Compute left/right state vectors and fluxes
-                                U_L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L]
-                                U_starL = [rhoL_star, rhoL_star*s_M, rhoL_star*vel%L(2:3), B%L(2:3), E_starL]
-                                U_R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R]
-                                U_starR = [rhoR_star, rhoR_star*s_M, rhoR_star*vel%R(2:3), B%R(2:3), E_starR]
-
-                                ! Compute the left/right fluxes
-                                F_L(1) = U_L(2)
-                                F_L(2) = U_L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot_L
-                                F_L(3:4) = U_L(2)*vel%L(2:3) - B%L(1)*B%L(2:3)
-                                F_L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1)
-                                F_L(7) = (E%L + pTot_L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3))
-
-                                F_R(1) = U_R(2)
-                                F_R(2) = U_R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot_R
-                                F_R(3:4) = U_R(2)*vel%R(2:3) - B%R(1)*B%R(2:3)
-                                F_R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1)
-                                F_R(7) = (E%R + pTot_R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3))
-                                ! Compute the star flux using HLL relation
-                                F_starL = F_L + s_L*(U_starL - U_L)
-                                F_starR = F_R + s_R*(U_starR - U_R)
-                                ! Compute the rotational (Alfvén) speeds
-                                s_starL = s_M - abs(B%L(1))/sqrt(rhoL_star)
-                                s_starR = s_M + abs(B%L(1))/sqrt(rhoR_star)
-                                ! Compute the double–star states [Miyoshi Eqns. (59)-(62)]
-                                sqrt_rhoL_star = sqrt(rhoL_star); sqrt_rhoR_star = sqrt(rhoR_star)
-                                vL_star = vel%L(2); wL_star = vel%L(3)
-                                vR_star = vel%R(2); wR_star = vel%R(3)
-
-                                ! (6) Compute the double–star states [Miyoshi Eqns. (59)-(62)]
-                                denom_ds = sqrt_rhoL_star + sqrt_rhoR_star
-                                sign_Bx = sign(1._wp, B%L(1))
-                                v_double = (sqrt_rhoL_star*vL_star + sqrt_rhoR_star*vR_star + (B%R(2) - B%L(2))*sign_Bx)/denom_ds
-                                w_double = (sqrt_rhoL_star*wL_star + sqrt_rhoR_star*wR_star + (B%R(3) - B%L(3))*sign_Bx)/denom_ds
-                                By_double = (sqrt_rhoL_star*B%R(2) + sqrt_rhoR_star*B%L(2) + sqrt_rhoL_star*sqrt_rhoR_star*(vR_star - vL_star)*sign_Bx)/denom_ds
-                                Bz_double = (sqrt_rhoL_star*B%R(3) + sqrt_rhoR_star*B%L(3) + sqrt_rhoL_star*sqrt_rhoR_star*(wR_star - wL_star)*sign_Bx)/denom_ds
-
-                                E_doubleL = E_starL - sqrt_rhoL_star*((vL_star*B%L(2) + wL_star*B%L(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx
-                                E_doubleR = E_starR + sqrt_rhoR_star*((vR_star*B%R(2) + wR_star*B%R(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx
-                                E_double = 0.5_wp*(E_doubleL + E_doubleR)
-
-                                U_doubleL = [rhoL_star, rhoL_star*s_M, rhoL_star*v_double, rhoL_star*w_double, By_double, Bz_double, E_double]
-                                U_doubleR = [rhoR_star, rhoR_star*s_M, rhoR_star*v_double, rhoR_star*w_double, By_double, Bz_double, E_double]
-
-                                ! (11) Choose HLLD flux based on wave-speed regions
-                                if (0.0_wp <= s_L) then
-                                    F_hlld = F_L
-                                else if (0.0_wp <= s_starL) then
-                                    F_hlld = F_L + s_L*(U_starL - U_L)
-                                else if (0.0_wp <= s_M) then
-                                    F_hlld = F_starL + s_starL*(U_doubleL - U_starL)
-                                else if (0.0_wp <= s_starR) then
-                                    F_hlld = F_starR + s_starR*(U_doubleR - U_starR)
-                                else if (0.0_wp <= s_R) then
-                                    F_hlld = F_R + s_R*(U_starR - U_R)
-                                else
-                                    F_hlld = F_R
-                                end if
+                            ! Sum properties of all fluid components
+                            rho%L = 0._wp; gamma%L = 0._wp; pi_inf%L = 0._wp; qv%L = 0._wp
+                            rho%R = 0._wp; gamma%R = 0._wp; pi_inf%R = 0._wp; qv%R = 0._wp
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                rho%L = rho%L + alpha_rho_L(i)
+                                gamma%L = gamma%L + alpha_L(i)*gammas(i)
+                                pi_inf%L = pi_inf%L + alpha_L(i)*pi_infs(i)
+                                qv%L = qv%L + alpha_rho_L(i)*qvs(i)
+
+                                rho%R = rho%R + alpha_rho_R(i)
+                                gamma%R = gamma%R + alpha_R(i)*gammas(i)
+                                pi_inf%R = pi_inf%R + alpha_R(i)*pi_infs(i)
+                                qv%R = qv%R + alpha_rho_R(i)*qvs(i)
+                            end do
 
-                                ! (12) Reorder and write temporary variables to the flux array
-                                ! Mass
-                                flux_rs${XYZ}$_vf(j, k, l, 1) = F_hlld(1) ! TODO multi-component
-                                ! Momentum
-                                flux_rs${XYZ}$_vf(j, k, l, [contxe + dir_idx(1), contxe + dir_idx(2), contxe + dir_idx(3)]) = F_hlld([2, 3, 4])
-                                ! Magnetic field
-                                if (n == 0) then
-                                    flux_rs${XYZ}$_vf(j, k, l, [B_idx%beg, B_idx%beg + 1]) = F_hlld([5, 6])
-                                else
-                                    flux_rs${XYZ}$_vf(j, k, l, [B_idx%beg + dir_idx(2) - 1, B_idx%beg + dir_idx(3) - 1]) = F_hlld([5, 6])
-                                end if
-                                ! Energy
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7)
-                                ! Partial fraction
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = advxb, advxe
-                                    flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now)
-                                end do
+                            pres_mag%L = 0.5_wp*sum(B%L**2._wp)
+                            pres_mag%R = 0.5_wp*sum(B%R**2._wp)
+                            E%L = gamma%L*pres%L + pi_inf%L + 0.5_wp*rho%L*vel_rms%L + qv%L + pres_mag%L
+                            E%R = gamma%R*pres%R + pi_inf%R + 0.5_wp*rho%R*vel_rms%R + qv%R + pres_mag%R ! includes magnetic energy
+                            H_no_mag%L = (E%L + pres%L - pres_mag%L)/rho%L
+                            H_no_mag%R = (E%R + pres%R - pres_mag%R)/rho%R ! stagnation enthalpy here excludes magnetic energy (only used to find speed of sound)
+
+                            ! (2) Compute fast wave speeds
+                            call s_compute_speed_of_sound(pres%L, rho%L, gamma%L, pi_inf%L, H_no_mag%L, alpha_L, vel_rms%L, 0._wp, c%L, qv%L)
+                            call s_compute_speed_of_sound(pres%R, rho%R, gamma%R, pi_inf%R, H_no_mag%R, alpha_R, vel_rms%R, 0._wp, c%R, qv%R)
+                            call s_compute_fast_magnetosonic_speed(rho%L, c%L, B%L, norm_dir, c_fast%L, H_no_mag%L)
+                            call s_compute_fast_magnetosonic_speed(rho%R, c%R, B%R, norm_dir, c_fast%R, H_no_mag%R)
+
+                            ! (3) Compute contact speed s_M [Miyoshi Equ. (38)]
+                            s_L = min(vel%L(1) - c_fast%L, vel%R(1) - c_fast%R)
+                            s_R = max(vel%R(1) + c_fast%R, vel%L(1) + c_fast%L)
+
+                            pTot_L = pres%L + pres_mag%L
+                            pTot_R = pres%R + pres_mag%R
+
+                            s_M = (((s_R - vel%R(1))*rho%R*vel%R(1) - &
+                                    (s_L - vel%L(1))*rho%L*vel%L(1) - pTot_R + pTot_L)/ &
+                                   ((s_R - vel%R(1))*rho%R - (s_L - vel%L(1))*rho%L))
+
+                            ! (4) Compute star state variables
+                            rhoL_star = rho%L*(s_L - vel%L(1))/(s_L - s_M)
+                            rhoR_star = rho%R*(s_R - vel%R(1))/(s_R - s_M)
+                            p_star = pTot_L + rho%L*(s_L - vel%L(1))*(s_M - vel%L(1))/(s_L - s_M)
+                            E_starL = ((s_L - vel%L(1))*E%L - pTot_L*vel%L(1) + p_star*s_M)/(s_L - s_M)
+                            E_starR = ((s_R - vel%R(1))*E%R - pTot_R*vel%R(1) + p_star*s_M)/(s_R - s_M)
+
+                            ! (5) Compute left/right state vectors and fluxes
+                            U_L = [rho%L, rho%L*vel%L(1:3), B%L(2:3), E%L]
+                            U_starL = [rhoL_star, rhoL_star*s_M, rhoL_star*vel%L(2:3), B%L(2:3), E_starL]
+                            U_R = [rho%R, rho%R*vel%R(1:3), B%R(2:3), E%R]
+                            U_starR = [rhoR_star, rhoR_star*s_M, rhoR_star*vel%R(2:3), B%R(2:3), E_starR]
+
+                            ! Compute the left/right fluxes
+                            F_L(1) = U_L(2)
+                            F_L(2) = U_L(2)*vel%L(1) - B%L(1)*B%L(1) + pTot_L
+                            F_L(3:4) = U_L(2)*vel%L(2:3) - B%L(1)*B%L(2:3)
+                            F_L(5:6) = vel%L(1)*B%L(2:3) - vel%L(2:3)*B%L(1)
+                            F_L(7) = (E%L + pTot_L)*vel%L(1) - B%L(1)*(vel%L(1)*B%L(1) + vel%L(2)*B%L(2) + vel%L(3)*B%L(3))
+
+                            F_R(1) = U_R(2)
+                            F_R(2) = U_R(2)*vel%R(1) - B%R(1)*B%R(1) + pTot_R
+                            F_R(3:4) = U_R(2)*vel%R(2:3) - B%R(1)*B%R(2:3)
+                            F_R(5:6) = vel%R(1)*B%R(2:3) - vel%R(2:3)*B%R(1)
+                            F_R(7) = (E%R + pTot_R)*vel%R(1) - B%R(1)*(vel%R(1)*B%R(1) + vel%R(2)*B%R(2) + vel%R(3)*B%R(3))
+                            ! Compute the star flux using HLL relation
+                            F_starL = F_L + s_L*(U_starL - U_L)
+                            F_starR = F_R + s_R*(U_starR - U_R)
+                            ! Compute the rotational (Alfvén) speeds
+                            s_starL = s_M - abs(B%L(1))/sqrt(rhoL_star)
+                            s_starR = s_M + abs(B%L(1))/sqrt(rhoR_star)
+                            ! Compute the double–star states [Miyoshi Eqns. (59)-(62)]
+                            sqrt_rhoL_star = sqrt(rhoL_star); sqrt_rhoR_star = sqrt(rhoR_star)
+                            vL_star = vel%L(2); wL_star = vel%L(3)
+                            vR_star = vel%R(2); wR_star = vel%R(3)
+
+                            ! (6) Compute the double–star states [Miyoshi Eqns. (59)-(62)]
+                            denom_ds = sqrt_rhoL_star + sqrt_rhoR_star
+                            sign_Bx = sign(1._wp, B%L(1))
+                            v_double = (sqrt_rhoL_star*vL_star + sqrt_rhoR_star*vR_star + (B%R(2) - B%L(2))*sign_Bx)/denom_ds
+                            w_double = (sqrt_rhoL_star*wL_star + sqrt_rhoR_star*wR_star + (B%R(3) - B%L(3))*sign_Bx)/denom_ds
+                            By_double = (sqrt_rhoL_star*B%R(2) + sqrt_rhoR_star*B%L(2) + sqrt_rhoL_star*sqrt_rhoR_star*(vR_star - vL_star)*sign_Bx)/denom_ds
+                            Bz_double = (sqrt_rhoL_star*B%R(3) + sqrt_rhoR_star*B%L(3) + sqrt_rhoL_star*sqrt_rhoR_star*(wR_star - wL_star)*sign_Bx)/denom_ds
+
+                            E_doubleL = E_starL - sqrt_rhoL_star*((vL_star*B%L(2) + wL_star*B%L(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx
+                            E_doubleR = E_starR + sqrt_rhoR_star*((vR_star*B%R(2) + wR_star*B%R(3)) - (v_double*By_double + w_double*Bz_double))*sign_Bx
+                            E_double = 0.5_wp*(E_doubleL + E_doubleR)
+
+                            U_doubleL = [rhoL_star, rhoL_star*s_M, rhoL_star*v_double, rhoL_star*w_double, By_double, Bz_double, E_double]
+                            U_doubleR = [rhoR_star, rhoR_star*s_M, rhoR_star*v_double, rhoR_star*w_double, By_double, Bz_double, E_double]
+
+                            ! (11) Choose HLLD flux based on wave-speed regions
+                            if (0.0_wp <= s_L) then
+                                F_hlld = F_L
+                            else if (0.0_wp <= s_starL) then
+                                F_hlld = F_L + s_L*(U_starL - U_L)
+                            else if (0.0_wp <= s_M) then
+                                F_hlld = F_starL + s_starL*(U_doubleL - U_starL)
+                            else if (0.0_wp <= s_starR) then
+                                F_hlld = F_starR + s_starR*(U_doubleR - U_starR)
+                            else if (0.0_wp <= s_R) then
+                                F_hlld = F_R + s_R*(U_starR - U_R)
+                            else
+                                F_hlld = F_R
+                            end if
 
-                                flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
+                            ! (12) Reorder and write temporary variables to the flux array
+                            ! Mass
+                            flux_rs${XYZ}$_vf(j, k, l, 1) = F_hlld(1) ! TODO multi-component
+                            ! Momentum
+                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(1)) = F_hlld(2)
+                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(2)) = F_hlld(3)
+                            flux_rs${XYZ}$_vf(j, k, l, contxe + dir_idx(3)) = F_hlld(4)
+                            ! Magnetic field
+                            if (n == 0) then
+                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg) = F_hlld(5)
+                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + 1) = F_hlld(6)
+                            else
+                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(2) - 1) = F_hlld(5)
+                                flux_rs${XYZ}$_vf(j, k, l, B_idx%beg + dir_idx(3) - 1) = F_hlld(6)
+                            end if
+                            ! Energy
+                            flux_rs${XYZ}$_vf(j, k, l, E_idx) = F_hlld(7)
+                            ! Partial fraction
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = advxb, advxe
+                                flux_rs${XYZ}$_vf(j, k, l, i) = 0._wp ! TODO multi-component (zero for now)
                             end do
+
+                            flux_src_rs${XYZ}$_vf(j, k, l, advxb) = 0._wp
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                #:endblock UNDEF_AMD
+                end do
+                $:END_GPU_PARALLEL_LOOP()
             end if
         #:endfor
 
@@ -4059,7 +4108,7 @@ contains
                 end do
                 $:END_GPU_PARALLEL_LOOP()
 
-                if (viscous) then
+                if (viscous .or. dummy) then
                     $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do l = isz%beg, isz%end
@@ -4118,7 +4167,7 @@ contains
                 end do
                 $:END_GPU_PARALLEL_LOOP()
 
-                if (viscous) then
+                if (viscous .or. dummy) then
 
                     $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
@@ -4181,7 +4230,7 @@ contains
                 end do
                 $:END_GPU_PARALLEL_LOOP()
 
-                if (viscous) then
+                if (viscous .or. dummy) then
 
                     $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
@@ -4235,7 +4284,7 @@ contains
                 end do
                 $:END_GPU_PARALLEL_LOOP()
 
-                if (viscous) then
+                if (viscous .or. dummy) then
 
                     $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
@@ -4292,7 +4341,7 @@ contains
                 end do
                 $:END_GPU_PARALLEL_LOOP()
 
-                if (viscous) then
+                if (viscous .or. dummy) then
                     $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
@@ -4340,7 +4389,7 @@ contains
                 end do
                 $:END_GPU_PARALLEL_LOOP()
 
-                if (viscous) then
+                if (viscous .or. dummy) then
                     $:GPU_PARALLEL_LOOP(collapse=3)
                     do i = momxb, momxe
                         do k = isy%beg, isy%end
@@ -4414,7 +4463,7 @@ contains
 
         if (norm_dir == 1) then
 
-            if (viscous .or. (surface_tension)) then
+            if (viscous .or. (surface_tension) .or. dummy) then
 
                 $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
@@ -4462,7 +4511,7 @@ contains
             ! Reshaping Inputted Data in y-direction
         elseif (norm_dir == 2) then
 
-            if (viscous .or. (surface_tension)) then
+            if (viscous .or. (surface_tension) .or. dummy) then
                 $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
                     do l = is3%beg, is3%end
@@ -4509,7 +4558,7 @@ contains
             ! Reshaping Inputted Data in z-direction
         else
 
-            if (viscous .or. (surface_tension)) then
+            if (viscous .or. (surface_tension) .or. dummy) then
                 $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = momxb, E_idx
                     do j = is1%beg, is1%end
@@ -4658,12 +4707,14 @@ contains
                     end select
 
                     ! Divergence in cylindrical coordinates (vx=vz_cyl, vy=vr_cyl, vz=vtheta_cyl)
+                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                     divergence_cyl = avg_dvdx_int(1) + avg_dvdy_int(2) + avg_v_int(2)/r_eff
                     if (num_dims > 2) then
                         #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
                             divergence_cyl = divergence_cyl + avg_dvdz_int(3)/r_eff
                         #:endif
                     end if
+                    #:endif
 
                     stress_vector_shear = 0.0_wp
                     stress_normal_bulk = 0.0_wp
@@ -4675,7 +4726,9 @@ contains
                         case (1) ! X-face (axial normal, z_cyl)
                             stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const
                             if (num_dims > 1) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 stress_vector_shear(2) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
+                                #:endif
                             end if
                             if (num_dims > 2) then
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
@@ -4684,6 +4737,7 @@ contains
                             end if
                         case (2) ! Y-face (radial normal, r_cyl)
                             if (num_dims > 1) then
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                                 stress_vector_shear(1) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
                                 stress_vector_shear(2) = (2.0_wp*avg_dvdy_int(2))/Re_s + div_v_term_const
                                 if (num_dims > 2) then
@@ -4691,6 +4745,7 @@ contains
                                         stress_vector_shear(3) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s
                                     #:endif
                                 end if
+                                #:endif
                             else
                                 stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const
                             end if
@@ -4790,8 +4845,10 @@ contains
                         vel_grad_avg(vel_comp_idx, 1) = 0.5_wp*(dvelL_dx_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
                                                                 dvelR_dx_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
                         if (num_dims > 1) then
+                            #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
                             vel_grad_avg(vel_comp_idx, 2) = 0.5_wp*(dvelL_dy_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
                                                                     dvelR_dy_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
+                            #:endif
                         end if
                         if (num_dims > 2) then
                             #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 0aa632d3ef..873a614cbb 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -97,14 +97,22 @@ contains
             & cray_inline=True)
 
         type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), intent(inout), dimension(2) :: alpha
+        real(wp), intent(inout), dimension(3) :: vel
+#:else 
         real(wp), intent(inout), dimension(num_fluids) :: alpha
         real(wp), intent(inout), dimension(num_vels) :: vel
+#:endif
         real(wp), intent(inout) :: rho, gamma, pi_inf, vel_sum, H, pres
         real(wp), intent(out) :: qv
         integer, intent(in) :: j, k, l
         real(wp), dimension(2), intent(inout) :: Re
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(2) :: alpha_rho, Gs
+#:else
         real(wp), dimension(num_fluids) :: alpha_rho, Gs
+#:endif
         real(wp) :: E, G_local
 
         integer :: i
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index e6f311cf69..86284622c6 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1373,9 +1373,10 @@ contains
         ! Computation of parameters, allocation of memory, association of pointers,
         ! and/or execution of any other tasks that are needed to properly configure
         ! the modules. The preparations below DO DEPEND on the grid being complete.
-        if (igr) then
+        if (igr .or. dummy) then
             call s_initialize_igr_module()
-        else
+        end if 
+        if(.not. igr .or. dummy) then
             if (recon_type == WENO_TYPE) then
                 call s_initialize_weno_module()
             elseif (recon_type == MUSCL_TYPE) then
@@ -1534,14 +1535,14 @@ contains
         #:if not MFC_CASE_OPTIMIZATION
             $:GPU_UPDATE(device='[igr,nb,igr_order]')
         #:endif
-        #:block DEF_AMD
+        #:if USING_AMD
             block
                 use m_thermochem, only: molecular_weights
                 use m_chemistry, only: molecular_weights_nonparameter
                 molecular_weights_nonparameter(:) = molecular_weights(:)
                 $:GPU_UPDATE(device='[molecular_weights_nonparameter]')
             end block
-        #:endblock
+        #:endif
 
     end subroutine s_initialize_gpu_vars
 
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 86e187dca1..d51b67f659 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -134,7 +134,7 @@ contains
             $:END_GPU_PARALLEL_LOOP()
 
         elseif (id == 2) then
-
+            #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
             $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW]')
             do l = isz%beg, isz%end
                 do k = isy%beg, isy%end
@@ -178,6 +178,7 @@ contains
                 end do
             end do
             $:END_GPU_PARALLEL_LOOP()
+            #:endif
 
         elseif (id == 3) then
             #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
@@ -324,7 +325,7 @@ contains
         integer :: i, j, k, l
 
         #:for SCHEME, TYPE in [('weno', 'WENO_TYPE'),('muscl', 'MUSCL_TYPE')]
-            if (recon_type == ${TYPE}$) then
+            if (recon_type == ${TYPE}$ .or. dummy) then
                 ! Reconstruction in s1-direction
 
                 if (norm_dir == 1) then
@@ -345,49 +346,49 @@ contains
                 end if
 
                 $:GPU_UPDATE(device='[is1,is2,is3,iv]')
+            end if
+        #:endfor
 
-                if (recon_dir == 1) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3%beg, is3%end
-                            do k = is2%beg, is2%end
-                                do j = is1%beg, is1%end
-                                    vL_x(j, k, l, i) = v_vf(i)%sf(j, k, l)
-                                    vR_x(j, k, l, i) = v_vf(i)%sf(j, k, l)
-                                end do
-                            end do
+        if (recon_dir == 1) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = iv%beg, iv%end
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            vL_x(j, k, l, i) = v_vf(i)%sf(j, k, l)
+                            vR_x(j, k, l, i) = v_vf(i)%sf(j, k, l)
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                else if (recon_dir == 2) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3%beg, is3%end
-                            do k = is2%beg, is2%end
-                                do j = is1%beg, is1%end
-                                    vL_y(j, k, l, i) = v_vf(i)%sf(k, j, l)
-                                    vR_y(j, k, l, i) = v_vf(i)%sf(k, j, l)
-                                end do
-                            end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        else if (recon_dir == 2) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = iv%beg, iv%end
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            vL_y(j, k, l, i) = v_vf(i)%sf(k, j, l)
+                            vR_y(j, k, l, i) = v_vf(i)%sf(k, j, l)
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                else if (recon_dir == 3) then
-                    $:GPU_PARALLEL_LOOP(collapse=4)
-                    do i = iv%beg, iv%end
-                        do l = is3%beg, is3%end
-                            do k = is2%beg, is2%end
-                                do j = is1%beg, is1%end
-                                    vL_z(j, k, l, i) = v_vf(i)%sf(l, k, j)
-                                    vR_z(j, k, l, i) = v_vf(i)%sf(l, k, j)
-                                end do
-                            end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        else if (recon_dir == 3) then
+            $:GPU_PARALLEL_LOOP(collapse=4)
+            do i = iv%beg, iv%end
+                do l = is3%beg, is3%end
+                    do k = is2%beg, is2%end
+                        do j = is1%beg, is1%end
+                            vL_z(j, k, l, i) = v_vf(i)%sf(l, k, j)
+                            vR_z(j, k, l, i) = v_vf(i)%sf(l, k, j)
                         end do
                     end do
-                    $:END_GPU_PARALLEL_LOOP()
-                end if
-            end if
-        #:endfor
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end if
 
     end subroutine s_reconstruct_cell_boundary_values_capillary
 
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index 8b18381ffb..cd4be44028 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -526,9 +526,10 @@ contains
 
             if (s == 1) then
                 if (run_time_info) then
-                    if (igr) then
+                    if (igr .or. dummy) then
                         call s_write_run_time_information(q_cons_ts(1)%vf, t_step)
-                    else
+                    end if 
+                    if(.not. igr .or. dummy) then
                         call s_write_run_time_information(q_prim_vf, t_step)
                     end if
                 end if
@@ -732,7 +733,7 @@ contains
         real(wp) :: dt_local
         integer :: j, k, l !< Generic loop iterators
 
-        if (.not. igr) then
+        if (.not. igr .or. dummy) then
             call s_convert_conservative_to_primitive_variables( &
                 q_cons_ts(1)%vf, &
                 q_T_sf, &
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index c6b053bdaf..f3fc59ee43 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -95,216 +95,220 @@ contains
         end do
         $:END_GPU_PARALLEL_LOOP()
 
-        if (shear_stress) then    ! Shear stresses
-            $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
-            do l = is3_viscous%beg, is3_viscous%end
-                do k = -1, 1
-                    do j = is1_viscous%beg, is1_viscous%end
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+            if (shear_stress) then    ! Shear stresses
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+                do l = is3_viscous%beg, is3_viscous%end
+                    do k = -1, 1
+                        do j = is1_viscous%beg, is1_viscous%end
 
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = 1, num_fluids
-                            alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
-                            if (bubbles_euler .and. num_fluids == 1) then
-                                alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, num_fluids
+                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
+                                if (bubbles_euler .and. num_fluids == 1) then
+                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
+                                else
+                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
+                                end if
+                            end do
+
+                            if (bubbles_euler) then
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
+
+                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else
+                                    rho_visc = alpha_rho_visc(1)
+                                    gamma_visc = gammas(1)
+                                    pi_inf_visc = pi_infs(1)
+                                end if
                             else
-                                alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
-                            end if
-                        end do
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
 
-                        if (bubbles_euler) then
-                            rho_visc = 0._wp
-                            gamma_visc = 0._wp
-                            pi_inf_visc = 0._wp
+                                alpha_visc_sum = 0._wp
+
+                                if (mpp_lim) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
+                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
+                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
+                                    end do
+
+                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
+
+                                end if
 
-                            if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
-                            else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids - 1
-                                    rho_visc = rho_visc + alpha_rho_visc(i)
-                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                end do
-                            else
-                                rho_visc = alpha_rho_visc(1)
-                                gamma_visc = gammas(1)
-                                pi_inf_visc = pi_infs(1)
-                            end if
-                        else
-                            rho_visc = 0._wp
-                            gamma_visc = 0._wp
-                            pi_inf_visc = 0._wp
 
-                            alpha_visc_sum = 0._wp
+                                if (viscous) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, 2
+                                        Re_visc(i) = dflt_real
 
-                            if (mpp_lim) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
-                                    alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
-                                    alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
-                                end do
+                                        if (Re_size(i) > 0) Re_visc(i) = 0._wp
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do q = 1, Re_size(i)
+                                            Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
+                                                         + Re_visc(i)
+                                        end do
 
-                                alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
+                                        Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps)
 
+                                    end do
+                                end if
                             end if
 
+                            tau_Re(2, 1) = (grad_y_vf(1)%sf(j, k, l) + &
+                                            grad_x_vf(2)%sf(j, k, l))/ &
+                                           Re_visc(1)
+
+                            tau_Re(2, 2) = (4._wp*grad_y_vf(2)%sf(j, k, l) &
+                                            - 2._wp*grad_x_vf(1)%sf(j, k, l) &
+                                            - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
+                                           (3._wp*Re_visc(1))
+                            $:GPU_LOOP(parallelism='[seq]')
+                            do i = 1, 2
+                                tau_Re_vf(contxe + i)%sf(j, k, l) = &
+                                    tau_Re_vf(contxe + i)%sf(j, k, l) - &
+                                    tau_Re(2, i)
+
+                                tau_Re_vf(E_idx)%sf(j, k, l) = &
+                                    tau_Re_vf(E_idx)%sf(j, k, l) - &
+                                    q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i)
+                            end do
+                        end do
+                    end do
+                end do
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        #:endif
+
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+            if (bulk_stress) then    ! Bulk stresses
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+                do l = is3_viscous%beg, is3_viscous%end
+                    do k = -1, 1
+                        do j = is1_viscous%beg, is1_viscous%end
+
                             $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_fluids
-                                rho_visc = rho_visc + alpha_rho_visc(i)
-                                gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
+                                if (bubbles_euler .and. num_fluids == 1) then
+                                    alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
+                                else
+                                    alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
+                                end if
                             end do
 
-                            if (viscous) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 2
-                                    Re_visc(i) = dflt_real
+                            if (bubbles_euler) then
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
 
-                                    if (Re_size(i) > 0) Re_visc(i) = 0._wp
+                                if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
+                                    end do
+                                else if ((model_eqns == 2) .and. (num_fluids > 2)) then
                                     $:GPU_LOOP(parallelism='[seq]')
-                                    do q = 1, Re_size(i)
-                                        Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
-                                                     + Re_visc(i)
+                                    do i = 1, num_fluids - 1
+                                        rho_visc = rho_visc + alpha_rho_visc(i)
+                                        gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
+                                        pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                     end do
+                                else
+                                    rho_visc = alpha_rho_visc(1)
+                                    gamma_visc = gammas(1)
+                                    pi_inf_visc = pi_infs(1)
+                                end if
+                            else
+                                rho_visc = 0._wp
+                                gamma_visc = 0._wp
+                                pi_inf_visc = 0._wp
 
-                                    Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps)
-
-                                end do
-                            end if
-                        end if
-
-                        tau_Re(2, 1) = (grad_y_vf(1)%sf(j, k, l) + &
-                                        grad_x_vf(2)%sf(j, k, l))/ &
-                                       Re_visc(1)
-
-                        tau_Re(2, 2) = (4._wp*grad_y_vf(2)%sf(j, k, l) &
-                                        - 2._wp*grad_x_vf(1)%sf(j, k, l) &
-                                        - 2._wp*q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
-                                       (3._wp*Re_visc(1))
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = 1, 2
-                            tau_Re_vf(contxe + i)%sf(j, k, l) = &
-                                tau_Re_vf(contxe + i)%sf(j, k, l) - &
-                                tau_Re(2, i)
-
-                            tau_Re_vf(E_idx)%sf(j, k, l) = &
-                                tau_Re_vf(E_idx)%sf(j, k, l) - &
-                                q_prim_vf(contxe + i)%sf(j, k, l)*tau_Re(2, i)
-                        end do
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
+                                alpha_visc_sum = 0._wp
 
-        if (bulk_stress) then    ! Bulk stresses
-            $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
-            do l = is3_viscous%beg, is3_viscous%end
-                do k = -1, 1
-                    do j = is1_viscous%beg, is1_viscous%end
+                                if (mpp_lim) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
+                                        alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
+                                        alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
+                                    end do
 
-                        $:GPU_LOOP(parallelism='[seq]')
-                        do i = 1, num_fluids
-                            alpha_rho_visc(i) = q_prim_vf(i)%sf(j, k, l)
-                            if (bubbles_euler .and. num_fluids == 1) then
-                                alpha_visc(i) = 1._wp - q_prim_vf(E_idx + i)%sf(j, k, l)
-                            else
-                                alpha_visc(i) = q_prim_vf(E_idx + i)%sf(j, k, l)
-                            end if
-                        end do
+                                    alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
 
-                        if (bubbles_euler) then
-                            rho_visc = 0._wp
-                            gamma_visc = 0._wp
-                            pi_inf_visc = 0._wp
+                                end if
 
-                            if (mpp_lim .and. (model_eqns == 2) .and. (num_fluids > 2)) then
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
                                     rho_visc = rho_visc + alpha_rho_visc(i)
                                     gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
                                     pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
                                 end do
-                            else if ((model_eqns == 2) .and. (num_fluids > 2)) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids - 1
-                                    rho_visc = rho_visc + alpha_rho_visc(i)
-                                    gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                    pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                                end do
-                            else
-                                rho_visc = alpha_rho_visc(1)
-                                gamma_visc = gammas(1)
-                                pi_inf_visc = pi_infs(1)
-                            end if
-                        else
-                            rho_visc = 0._wp
-                            gamma_visc = 0._wp
-                            pi_inf_visc = 0._wp
-
-                            alpha_visc_sum = 0._wp
-
-                            if (mpp_lim) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_rho_visc(i) = max(0._wp, alpha_rho_visc(i))
-                                    alpha_visc(i) = min(max(0._wp, alpha_visc(i)), 1._wp)
-                                    alpha_visc_sum = alpha_visc_sum + alpha_visc(i)
-                                end do
 
-                                alpha_visc = alpha_visc/max(alpha_visc_sum, sgm_eps)
-
-                            end if
+                                if (viscous) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, 2
+                                        Re_visc(i) = dflt_real
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                rho_visc = rho_visc + alpha_rho_visc(i)
-                                gamma_visc = gamma_visc + alpha_visc(i)*gammas(i)
-                                pi_inf_visc = pi_inf_visc + alpha_visc(i)*pi_infs(i)
-                            end do
+                                        if (Re_size(i) > 0) Re_visc(i) = 0._wp
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do q = 1, Re_size(i)
+                                            Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
+                                                         + Re_visc(i)
+                                        end do
 
-                            if (viscous) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, 2
-                                    Re_visc(i) = dflt_real
+                                        Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps)
 
-                                    if (Re_size(i) > 0) Re_visc(i) = 0._wp
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do q = 1, Re_size(i)
-                                        Re_visc(i) = alpha_visc(Re_idx(i, q))/Res_viscous(i, q) &
-                                                     + Re_visc(i)
                                     end do
-
-                                    Re_visc(i) = 1._wp/max(Re_visc(i), sgm_eps)
-
-                                end do
+                                end if
                             end if
-                        end if
 
-                        tau_Re(2, 2) = (grad_x_vf(1)%sf(j, k, l) + &
-                                        grad_y_vf(2)%sf(j, k, l) + &
-                                        q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
-                                       Re_visc(2)
+                            tau_Re(2, 2) = (grad_x_vf(1)%sf(j, k, l) + &
+                                            grad_y_vf(2)%sf(j, k, l) + &
+                                            q_prim_vf(momxb + 1)%sf(j, k, l)/y_cc(k))/ &
+                                           Re_visc(2)
 
-                        tau_Re_vf(momxb + 1)%sf(j, k, l) = &
-                            tau_Re_vf(momxb + 1)%sf(j, k, l) - &
-                            tau_Re(2, 2)
+                            tau_Re_vf(momxb + 1)%sf(j, k, l) = &
+                                tau_Re_vf(momxb + 1)%sf(j, k, l) - &
+                                tau_Re(2, 2)
 
-                        tau_Re_vf(E_idx)%sf(j, k, l) = &
-                            tau_Re_vf(E_idx)%sf(j, k, l) - &
-                            q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2)
+                            tau_Re_vf(E_idx)%sf(j, k, l) = &
+                                tau_Re_vf(E_idx)%sf(j, k, l) - &
+                                q_prim_vf(momxb + 1)%sf(j, k, l)*tau_Re(2, 2)
 
+                        end do
                     end do
                 end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end if
+                $:END_GPU_PARALLEL_LOOP()
+            end if
+        #:endif
 
         if (p == 0) return
         #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
@@ -636,120 +640,123 @@ contains
 
             if (n > 0) then
 
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do l = is3_viscous%beg, is3_viscous%end
-                    do j = is2_viscous%beg + 1, is2_viscous%end
-                        do k = is1_viscous%beg, is1_viscous%end
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = iv%beg, iv%end
-                                dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
-                                    (q_prim_qp%vf(i)%sf(k, j, l) - &
-                                     q_prim_qp%vf(i)%sf(k, j - 1, l))/ &
-                                    (y_cc(j) - y_cc(j - 1))
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do j = is2_viscous%beg + 1, is2_viscous%end
+                            do k = is1_viscous%beg, is1_viscous%end
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqL_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
+                                        (q_prim_qp%vf(i)%sf(k, j, l) - &
+                                         q_prim_qp%vf(i)%sf(k, j - 1, l))/ &
+                                        (y_cc(j) - y_cc(j - 1))
+                                end do
                             end do
                         end do
                     end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
+                    $:END_GPU_PARALLEL_LOOP()
 
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do l = is3_viscous%beg, is3_viscous%end
-                    do j = is2_viscous%beg, is2_viscous%end - 1
-                        do k = is1_viscous%beg, is1_viscous%end
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = iv%beg, iv%end
-                                dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
-                                    (q_prim_qp%vf(i)%sf(k, j + 1, l) - &
-                                     q_prim_qp%vf(i)%sf(k, j, l))/ &
-                                    (y_cc(j + 1) - y_cc(j))
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do j = is2_viscous%beg, is2_viscous%end - 1
+                            do k = is1_viscous%beg, is1_viscous%end
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqR_prim_dy_n(2)%vf(i)%sf(k, j, l) = &
+                                        (q_prim_qp%vf(i)%sf(k, j + 1, l) - &
+                                         q_prim_qp%vf(i)%sf(k, j, l))/ &
+                                        (y_cc(j + 1) - y_cc(j))
+                                end do
                             end do
                         end do
                     end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
+                    $:END_GPU_PARALLEL_LOOP()
 
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do l = is3_viscous%beg, is3_viscous%end
-                    do j = is2_viscous%beg + 1, is2_viscous%end
-                        do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = iv%beg, iv%end
-                                dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
-                                    (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
-                                     dqR_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
-                                     dqL_prim_dx_n(1)%vf(i)%sf(k, j - 1, l) + &
-                                     dqR_prim_dx_n(1)%vf(i)%sf(k, j - 1, l))
-
-                                dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
-                                                                     dqL_prim_dx_n(2)%vf(i)%sf(k, j, l)
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do j = is2_viscous%beg + 1, is2_viscous%end
+                            do k = is1_viscous%beg + 1, is1_viscous%end - 1
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
+                                        (dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
+                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
+                                         dqL_prim_dx_n(1)%vf(i)%sf(k, j - 1, l) + &
+                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j - 1, l))
+
+                                    dqL_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
+                                                                         dqL_prim_dx_n(2)%vf(i)%sf(k, j, l)
+                                end do
                             end do
                         end do
                     end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
+                    $:END_GPU_PARALLEL_LOOP()
 
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do l = is3_viscous%beg, is3_viscous%end
-                    do j = is2_viscous%beg, is2_viscous%end - 1
-                        do k = is1_viscous%beg + 1, is1_viscous%end - 1
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = iv%beg, iv%end
-                                dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
-                                    (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
-                                     dqR_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
-                                     dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
-                                     dqR_prim_dx_n(1)%vf(i)%sf(k, j, l))
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do j = is2_viscous%beg, is2_viscous%end - 1
+                            do k = is1_viscous%beg + 1, is1_viscous%end - 1
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = &
+                                        (dqL_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
+                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j + 1, l) + &
+                                         dqL_prim_dx_n(1)%vf(i)%sf(k, j, l) + &
+                                         dqR_prim_dx_n(1)%vf(i)%sf(k, j, l))
 
-                                dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
-                                                                     dqR_prim_dx_n(2)%vf(i)%sf(k, j, l)
+                                    dqR_prim_dx_n(2)%vf(i)%sf(k, j, l) = 25.e-2_wp* &
+                                                                         dqR_prim_dx_n(2)%vf(i)%sf(k, j, l)
 
+                                end do
                             end do
                         end do
                     end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
+                    $:END_GPU_PARALLEL_LOOP()
 
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do l = is3_viscous%beg, is3_viscous%end
-                    do k = is2_viscous%beg + 1, is2_viscous%end - 1
-                        do j = is1_viscous%beg + 1, is1_viscous%end
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = iv%beg, iv%end
-                                dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
-                                    (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
-                                     dqR_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
-                                     dqL_prim_dy_n(2)%vf(i)%sf(j - 1, k, l) + &
-                                     dqR_prim_dy_n(2)%vf(i)%sf(j - 1, k, l))
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do k = is2_viscous%beg + 1, is2_viscous%end - 1
+                            do j = is1_viscous%beg + 1, is1_viscous%end
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
+                                        (dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
+                                         dqR_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
+                                         dqL_prim_dy_n(2)%vf(i)%sf(j - 1, k, l) + &
+                                         dqR_prim_dy_n(2)%vf(i)%sf(j - 1, k, l))
 
-                                dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
-                                                                     dqL_prim_dy_n(1)%vf(i)%sf(j, k, l)
+                                    dqL_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
+                                                                         dqL_prim_dy_n(1)%vf(i)%sf(j, k, l)
 
+                                end do
                             end do
                         end do
                     end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
+                    $:END_GPU_PARALLEL_LOOP()
 
-                $:GPU_PARALLEL_LOOP(collapse=3)
-                do l = is3_viscous%beg, is3_viscous%end
-                    do k = is2_viscous%beg + 1, is2_viscous%end - 1
-                        do j = is1_viscous%beg, is1_viscous%end - 1
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = iv%beg, iv%end
-                                dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
-                                    (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
-                                     dqR_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
-                                     dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
-                                     dqR_prim_dy_n(2)%vf(i)%sf(j, k, l))
+                    $:GPU_PARALLEL_LOOP(collapse=3)
+                    do l = is3_viscous%beg, is3_viscous%end
+                        do k = is2_viscous%beg + 1, is2_viscous%end - 1
+                            do j = is1_viscous%beg, is1_viscous%end - 1
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = iv%beg, iv%end
+                                    dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = &
+                                        (dqL_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
+                                         dqR_prim_dy_n(2)%vf(i)%sf(j + 1, k, l) + &
+                                         dqL_prim_dy_n(2)%vf(i)%sf(j, k, l) + &
+                                         dqR_prim_dy_n(2)%vf(i)%sf(j, k, l))
 
-                                dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
-                                                                     dqR_prim_dy_n(1)%vf(i)%sf(j, k, l)
+                                    dqR_prim_dy_n(1)%vf(i)%sf(j, k, l) = 25.e-2_wp* &
+                                                                         dqR_prim_dy_n(1)%vf(i)%sf(j, k, l)
 
+                                end do
                             end do
                         end do
                     end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
+                    $:END_GPU_PARALLEL_LOOP()
+
+                #:endif
 
                 if (p > 0) then
                     #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
@@ -1009,7 +1016,7 @@ contains
         integer :: i, j, k, l
 
         #:for SCHEME, TYPE in [('weno','WENO_TYPE'), ('muscl','MUSCL_TYPE')]
-            if (recon_type == ${TYPE}$) then
+            if (recon_type == ${TYPE}$ .or. dummy) then
                 ! Reconstruction in s1-direction
 
                 if (norm_dir == 1) then
@@ -1048,53 +1055,53 @@ contains
                                        recon_dir, &
                                        is1_viscous, is2_viscous, is3_viscous)
                 end if
+            end if
+        #:endfor
 
-                if (viscous) then
-                    if (weno_Re_flux) then
-                        if (norm_dir == 2) then
-                            $:GPU_PARALLEL_LOOP(collapse=4)
-                            do i = iv%beg, iv%end
-                                do l = is3_viscous%beg, is3_viscous%end
-                                    do j = is1_viscous%beg, is1_viscous%end
-                                        do k = is2_viscous%beg, is2_viscous%end
-                                            vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i)
-                                            vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i)
-                                        end do
-                                    end do
+        if (viscous .or. dummy) then
+            if (weno_Re_flux) then
+                if (norm_dir == 2) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do l = is3_viscous%beg, is3_viscous%end
+                            do j = is1_viscous%beg, is1_viscous%end
+                                do k = is2_viscous%beg, is2_viscous%end
+                                    vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i)
+                                    vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i)
                                 end do
                             end do
-                            $:END_GPU_PARALLEL_LOOP()
-                        elseif (norm_dir == 3) then
-                            $:GPU_PARALLEL_LOOP(collapse=4)
-                            do i = iv%beg, iv%end
-                                do j = is1_viscous%beg, is1_viscous%end
-                                    do k = is2_viscous%beg, is2_viscous%end
-                                        do l = is3_viscous%beg, is3_viscous%end
-                                            vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i)
-                                            vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i)
-                                        end do
-                                    end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                elseif (norm_dir == 3) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do j = is1_viscous%beg, is1_viscous%end
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do l = is3_viscous%beg, is3_viscous%end
+                                    vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i)
+                                    vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i)
                                 end do
                             end do
-                            $:END_GPU_PARALLEL_LOOP()
-                        elseif (norm_dir == 1) then
-                            $:GPU_PARALLEL_LOOP(collapse=4)
-                            do i = iv%beg, iv%end
-                                do l = is3_viscous%beg, is3_viscous%end
-                                    do k = is2_viscous%beg, is2_viscous%end
-                                        do j = is1_viscous%beg, is1_viscous%end
-                                            vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i)
-                                            vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i)
-                                        end do
-                                    end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                elseif (norm_dir == 1) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do l = is3_viscous%beg, is3_viscous%end
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do j = is1_viscous%beg, is1_viscous%end
+                                    vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i)
+                                    vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i)
                                 end do
                             end do
-                            $:END_GPU_PARALLEL_LOOP()
-                        end if
-                    end if
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
                 end if
             end if
-        #:endfor
+        end if
 
     end subroutine s_reconstruct_cell_boundary_values_visc
 
@@ -1152,7 +1159,8 @@ contains
                                        is1_viscous, is2_viscous, is3_viscous)
                 end if
 
-                if (viscous) then
+                if (viscous .or. dummy) then
+                    #:if not MFC_CASE_OPTIMIZATION or viscous
                     if (weno_Re_flux) then
                         if (norm_dir == 2) then
                             $:GPU_PARALLEL_LOOP(collapse=4)
@@ -1195,6 +1203,7 @@ contains
                             $:END_GPU_PARALLEL_LOOP()
                         end if
                     end if
+                    #:endif
                 end if
             end if
 
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 3422dab8ec..6e8b668326 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -647,12 +647,21 @@ contains
         integer, intent(in) :: weno_dir
         type(int_bounds_info), intent(in) :: is1_weno_d, is2_weno_d, is3_weno_d
 
+#:if USING_AMD and not MFC_CASE_OPTIMIZATION
+        real(wp), dimension(-3:2) :: dvd
+        real(wp), dimension(0:4) :: poly
+        real(wp), dimension(0:4) :: alpha
+        real(wp), dimension(0:4) :: omega
+        real(wp), dimension(0:4) :: beta
+        real(wp), dimension(0:4) :: delta
+#:else
         real(wp), dimension(-weno_polyn:weno_polyn - 1) :: dvd
         real(wp), dimension(0:weno_num_stencils) :: poly
         real(wp), dimension(0:weno_num_stencils) :: alpha
         real(wp), dimension(0:weno_num_stencils) :: omega
         real(wp), dimension(0:weno_num_stencils) :: beta
         real(wp), dimension(0:weno_num_stencils) :: delta
+#:endif
         real(wp), dimension(-3:3) :: v ! temporary field value array for clarity (WENO7 only)
         real(wp) :: tau
 
@@ -664,12 +673,12 @@ contains
 
         $:GPU_UPDATE(device='[is1_weno,is2_weno,is3_weno]')
 
-        if (weno_order /= 1) then
+        if (weno_order /= 1 .or. dummy) then
             call s_initialize_weno(v_vf, &
                                    weno_dir)
         end if
 
-        if (weno_order == 1) then
+        if (weno_order == 1 .or. dummy) then
             if (weno_dir == 1) then
                 $:GPU_PARALLEL_LOOP(collapse=4)
                 do i = 1, ubound(v_vf, 1)
@@ -710,7 +719,8 @@ contains
                 end do
                 $:END_GPU_PARALLEL_LOOP()
             end if
-        elseif (weno_order == 3) then
+        end if
+        if (weno_order == 3 .or. dummy) then
             #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                 if (weno_dir == ${WENO_DIR}$) then
                     $:GPU_PARALLEL_LOOP(collapse=4,private='[beta,dvd,poly,omega,alpha,tau]')
@@ -720,6 +730,10 @@ contains
                                 do i = 1, v_size
                                     ! reconstruct from left side
 
+                                    alpha(:) = 0._wp 
+                                    omega(:) = 0._wp 
+                                    beta(:) = weno_eps
+
                                     dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) &
                                              - v_rs_ws_${XYZ}$ (j, k, l, i)
                                     dvd(-1) = v_rs_ws_${XYZ}$ (j, k, l, i) &
@@ -736,19 +750,19 @@ contains
                                               + weno_eps
 
                                     if (wenojs) then
-                                        alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta)
+                                        alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
 
                                     elseif (mapped_weno) then
-                                        alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta)
+                                        alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                         omega = alpha/sum(alpha)
-                                        alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) &
-                                                *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j))))
+                                        alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
+                                                *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
 
                                     elseif (wenoz) then
                                         ! Borges, et al. (2008)
 
                                         tau = abs(beta(1) - beta(0))
-                                        alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + tau/beta)
+                                        alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + tau/beta(0:weno_num_stencils))
 
                                     end if
 
@@ -764,17 +778,17 @@ contains
                                               + poly_coef_cbR_${XYZ}$ (j, 1, 0)*dvd(-1)
 
                                     if (wenojs) then
-                                        alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta)
+                                        alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
 
                                     elseif (mapped_weno) then
-                                        alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta)
+                                        alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                         omega = alpha/sum(alpha)
-                                        alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) &
-                                                *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j))))
+                                        alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
+                                                *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
 
                                     elseif (wenoz) then
 
-                                        alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + tau/beta)
+                                        alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + tau/beta(0:weno_num_stencils))
 
                                     end if
 
@@ -789,7 +803,8 @@ contains
                     $:END_GPU_PARALLEL_LOOP()
                 end if
             #:endfor
-        elseif (weno_order == 5) then
+        end if
+        if (weno_order == 5 .or. dummy) then
             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 1
                 #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                     if (weno_dir == ${WENO_DIR}$) then
@@ -834,29 +849,30 @@ contains
                                                   + weno_eps
 
                                         if (wenojs) then
-                                            alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta)
+                                            alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
 
                                         elseif (mapped_weno) then
-                                            alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta)
+                                            alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                             omega = alpha/sum(alpha)
-                                            alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) &
-                                                    *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j))))
+                                            alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
+                                                    *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
 
                                         elseif (wenoz) then
+
                                             ! Borges, et al. (2008)
 
                                             tau = abs(beta(2) - beta(0))                   ! Equation 25
-                                            alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + tau/beta)  ! Equation 28 (note: weno_eps was already added to beta)
+                                            alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + tau/beta(0:weno_num_stencils))  ! Equation 28 (note: weno_eps was already added to beta)
 
                                         elseif (teno) then
-                                            ! Fu, et al. (2016)
+                                             ! Fu, et al. (2016)
                                             ! Fu''s code: https://dx.doi.org/10.13140/RG.2.2.36250.34247
                                             tau = abs(beta(2) - beta(0))
                                             alpha = 1._wp + tau/beta                    ! Equation 22 (reuse alpha as gamma; pick C=1 & q=6)
                                             alpha = (alpha*alpha*alpha)**2._wp          ! Equation 22 cont. (some CPU compilers cannot optimize x**6.0)
                                             omega = alpha/sum(alpha)                    ! Equation 25 (reuse omega as xi)
                                             delta = merge(0._wp, 1._wp, omega < teno_CT)! Equation 26
-                                            alpha = delta*d_cbL_${XYZ}$ (:, j)          ! Equation 27
+                                            alpha(0:weno_num_stencils) = delta(0:weno_num_stencils)*d_cbL_${XYZ}$ (0:weno_num_stencils, j)    ! Equation 27
 
                                         end if
 
@@ -876,21 +892,22 @@ contains
                                                   + poly_coef_cbR_${XYZ}$ (j, 2, 0)*dvd(-1) &
                                                   + poly_coef_cbR_${XYZ}$ (j, 2, 1)*dvd(-2)
 
+
                                         if (wenojs) then
-                                            alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta)
+                                            alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
 
                                         elseif (mapped_weno) then
-                                            alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta)
+                                            alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                             omega = alpha/sum(alpha)
-                                            alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) &
-                                                    *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j))))
+                                            alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
+                                                    *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
 
                                         elseif (wenoz) then
 
-                                            alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + tau/beta)
+                                            alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + tau/beta(0:weno_num_stencils))
 
                                         elseif (teno) then
-                                            alpha = delta*d_cbR_${XYZ}$ (:, j)
+                                            alpha(0:weno_num_stencils) = delta(0:weno_num_stencils)*d_cbR_${XYZ}$ (0:weno_num_stencils, j)
 
                                         end if
 
@@ -911,7 +928,8 @@ contains
                     end if
                 #:endfor
             #:endif
-        elseif (weno_order == 7) then
+        end if
+        if (weno_order == 7 .or. dummy) then
             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 2
                 #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                     if (weno_dir == ${WENO_DIR}$) then
@@ -1026,28 +1044,28 @@ contains
                                         end if
 
                                         if (wenojs) then
-                                            alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta)
+                                            alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
 
                                         elseif (mapped_weno) then
-                                            alpha = d_cbL_${XYZ}$ (:, j)/(beta*beta)
+                                            alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                             omega = alpha/sum(alpha)
-                                            alpha = (d_cbL_${XYZ}$ (:, j)*(1._wp + d_cbL_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) &
-                                                    *(omega/(d_cbL_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbL_${XYZ}$ (:, j))))
+                                            alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
+                                                    *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
 
                                         elseif (wenoz) then
                                             ! Castro, et al. (2010)
                                             ! Don & Borges (2013) also helps
                                             tau = abs(beta(3) - beta(0)) ! Equation 50
-                                            alpha = d_cbL_${XYZ}$ (:, j)*(1._wp + (tau/beta)**wenoz_q) ! q = 2,3,4 for stability
+                                            alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + (tau/beta(0:weno_num_stencils))**wenoz_q) ! q = 2,3,4 for stability
 
                                         elseif (teno) then
                                             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3
                                                 tau = abs(beta(4) - beta(3)) ! Note the reordering of stencils
                                                 alpha = 1._wp + tau/beta
-                                                alpha = (alpha*alpha*alpha)**2._wp ! some CPU compilers cannot optimize x**6.0
+                                                alpha = (alpha**3._wp)**2._wp ! some CPU compilers cannot optimize x**6.0
                                                 omega = alpha/sum(alpha)
                                                 delta = merge(0._wp, 1._wp, omega < teno_CT)
-                                                alpha = delta*d_cbL_${XYZ}$ (:, j)
+                                                alpha(0:weno_num_stencils) = delta(0:weno_num_stencils)*d_cbL_${XYZ}$ (0:weno_num_stencils, j)
                                             #:endif
                                         end if
 
@@ -1083,19 +1101,20 @@ contains
                                         end if
 
                                         if (wenojs) then
-                                            alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta)
+                                            alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
 
                                         elseif (mapped_weno) then
-                                            alpha = d_cbR_${XYZ}$ (:, j)/(beta*beta)
+                                            alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                             omega = alpha/sum(alpha)
-                                            alpha = (d_cbR_${XYZ}$ (:, j)*(1._wp + d_cbR_${XYZ}$ (:, j) - 3._wp*omega) + omega**2._wp) &
-                                                    *(omega/(d_cbR_${XYZ}$ (:, j)**2._wp + omega*(1._wp - 2._wp*d_cbR_${XYZ}$ (:, j))))
+                                            alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
+                                                    *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
 
                                         elseif (wenoz) then
-                                            alpha = d_cbR_${XYZ}$ (:, j)*(1._wp + (tau/beta)**wenoz_q)
+
+                                            alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + tau/beta(0:weno_num_stencils))
 
                                         elseif (teno) then
-                                            alpha = delta*d_cbR_${XYZ}$ (:, j)
+                                            alpha(0:weno_num_stencils) = delta(0:weno_num_stencils)*d_cbR_${XYZ}$ (0:weno_num_stencils, j)
 
                                         end if
 
diff --git a/toolchain/mfc/case.py b/toolchain/mfc/case.py
index 269c8438bf..643dd4d188 100644
--- a/toolchain/mfc/case.py
+++ b/toolchain/mfc/case.py
@@ -240,8 +240,8 @@ def __get_sim_fpp(self, print: bool) -> str:
 #:set recon_type            = {recon_type}
 #:set weno_order            = {weno_order}
 #:set weno_polyn            = {weno_polyn}
-#:set muscl_order           = {int(self.params.get("muscl_order", 0))}
-#:set muscl_polyn           = {int(self.params.get("muscl_order", 0))}
+#:set muscl_order           = {int(self.params.get("muscl_order", 1))}
+#:set muscl_polyn           = {int(self.params.get("muscl_order", 1))}
 #:set muscl_lim             = {int(self.params.get("muscl_lim", 1))}
 #:set weno_num_stencils     = {weno_num_stencils}
 #:set nb                    = {int(self.params.get("nb", 1))}

From 13bb65ca58b2abf0d0a4299899b58f502c0e3eb2 Mon Sep 17 00:00:00 2001
From: Anand Radhakrishnan <aradhakr34@login03.frontier.olcf.ornl.gov>
Date: Sun, 25 Jan 2026 08:26:05 -0500
Subject: [PATCH 02/21] 95% of test suite passes

---
 CMakeLists.txt                            |   4 +-
 src/common/m_boundary_common.fpp          |  33 +-
 src/common/m_chemistry.fpp                |  14 +-
 src/common/m_phase_change.fpp             |  77 +++--
 src/common/m_variables_conversion.fpp     |  65 +++-
 src/simulation/include/inline_riemann.fpp |  24 +-
 src/simulation/m_acoustic_src.fpp         |   5 +-
 src/simulation/m_bubbles_EE.fpp           |   7 +-
 src/simulation/m_bubbles_EL.fpp           |  12 +-
 src/simulation/m_cbc.fpp                  |  27 +-
 src/simulation/m_compute_cbc.fpp          | 116 +++++--
 src/simulation/m_data_output.fpp          |   2 +-
 src/simulation/m_hyperelastic.fpp         |   6 +-
 src/simulation/m_ibm.fpp                  |  14 +-
 src/simulation/m_igr.fpp                  | 380 +++++++++++-----------
 src/simulation/m_pressure_relaxation.fpp  |   9 +-
 src/simulation/m_qbmm.fpp                 |  51 ++-
 src/simulation/m_rhs.fpp                  |   4 +-
 src/simulation/m_riemann_solvers.fpp      | 110 +++++--
 src/simulation/m_sim_helpers.fpp          |   4 +-
 src/simulation/m_surface_tension.fpp      |   7 +-
 src/simulation/m_time_steppers.fpp        |  13 +-
 src/simulation/m_viscous.fpp              |  88 ++---
 src/simulation/m_weno.fpp                 | 100 ++++--
 toolchain/mfc/test/case.py                |   2 +
 25 files changed, 786 insertions(+), 388 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0507754dce..2c63582a3e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -596,8 +596,8 @@ function(MFC_SETUP_TARGET)
                         PRIVATE -DFRONTIER_UNIFIED)
                 endif()
 
-    		find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
-                target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn flang_rt.hostdevice)
+		find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
+		target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn flang_rt.hostdevice)
             endif()
         elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
             target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 4fccf3aa68..333df31442 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -44,7 +44,7 @@ module m_boundary_common
  s_populate_grid_variables_buffers, &
  s_finalize_boundary_common_module
 
-    public :: bc_buffers
+    !public :: bc_buffers
 
 #ifdef MFC_MPI
     public :: MPI_BC_TYPE_TYPE, MPI_BC_BUFFER_TYPE
@@ -54,24 +54,31 @@ contains
 
     impure subroutine s_initialize_boundary_common_module()
 
+        integer :: i, j 
+
         @:ALLOCATE(bc_buffers(1:num_dims, 1:2))
 
         if (bc_io) then
             @:ALLOCATE(bc_buffers(1, 1)%sf(1:sys_size, 0:n, 0:p))
             @:ALLOCATE(bc_buffers(1, 2)%sf(1:sys_size, 0:n, 0:p))
-            @:ACC_SETUP_SFs(bc_buffers(1,1), bc_buffers(1,2))
+            #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
             if (n > 0) then
                 @:ALLOCATE(bc_buffers(2,1)%sf(-buff_size:m+buff_size,1:sys_size,0:p))
-                @:ALLOCATE(bc_buffers(2,2)%sf(-buff_size:m+buff_size,1:sys_size,0:p))
-                @:ACC_SETUP_SFs(bc_buffers(2,1), bc_buffers(2,2))
-                if (p > 0) then
-                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                        @:ALLOCATE(bc_buffers(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,1:sys_size))
-                        @:ALLOCATE(bc_buffers(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,1:sys_size))
-                        @:ACC_SETUP_SFs(bc_buffers(3,1), bc_buffers(3,2))
-                    #:endif
-                end if
+                @:ALLOCATE(bc_buffers(2,2)%sf(-buff_size:m+buff_size,1:sys_size,0:p))                
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                if (p > 0) then            
+                    @:ALLOCATE(bc_buffers(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,1:sys_size))
+                    @:ALLOCATE(bc_buffers(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,1:sys_size))
+                end if  
+                #:endif
             end if
+            #:endif
+            do i = 1, num_dims
+                do j = 1, 2
+                @:ACC_SETUP_SFs(bc_buffers(i,j))
+                end do
+            end do
+
         end if
 
     end subroutine s_initialize_boundary_common_module
@@ -1056,6 +1063,7 @@ contains
                 end do
             end if
         elseif (bc_dir == 2) then !< y-direction
+#:if not MFC_CASE_OPTIMIZATION or num_dims > 1
             if (bc_loc == -1) then !< bc_y%beg
                 do i = 1, sys_size
                     do j = 1, buff_size
@@ -1071,7 +1079,9 @@ contains
                     end do
                 end do
             end if
+#:endif
         elseif (bc_dir == 3) then !< z-direction
+#:if not MFC_CASE_OPTIMIZATION or num_dims > 2
             if (bc_loc == -1) then !< bc_z%beg
                 do i = 1, sys_size
                     do j = 1, buff_size
@@ -1087,6 +1097,7 @@ contains
                     end do
                 end do
             end if
+#:endif
         end if
 #else
         call s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l)
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index 5d3d1b7811..4e89ea4a8f 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -126,8 +126,13 @@ contains
         integer :: eqn
         real(wp) :: T
         real(wp) :: rho, omega_m
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(10) :: Ys
+        real(wp), dimension(10) :: omega
+#:else 
         real(wp), dimension(num_species) :: Ys
         real(wp), dimension(num_species) :: omega
+#:endif
 
         $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega_m]', copyin='[bounds]')
         do z = bounds(3)%beg, bounds(3)%end
@@ -169,11 +174,18 @@ contains
         type(int_bounds_info), intent(in) :: irx, iry, irz
 
         integer, intent(in) :: idir
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell
+        real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2
+        real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k
+        real(wp), dimension(10) :: Mass_Diffu_Flux, dYk_dxi
+#:else 
         real(wp), dimension(num_species) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell
         real(wp), dimension(num_species) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2
         real(wp), dimension(num_species) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k
         real(wp), dimension(num_species) :: Mass_Diffu_Flux, dYk_dxi
+#:endif
+
         real(wp) :: Mass_Diffu_Energy
         real(wp) :: MW_L, MW_R, MW_cell, Rgas_L, Rgas_R, T_L, T_R, P_L, P_R, rho_L, rho_R, rho_cell, rho_Vic
         real(wp) :: lambda_L, lambda_R, lambda_Cell, dT_dxi, grid_spacing
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index cba9744427..7a88a7b323 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -1,6 +1,6 @@
 !> energies (6-eqn to 4-eqn) equilibrium through an infinitely fast (algebraic)
 !> procedure.
-
+#:include 'case.fpp'
 #:include 'macros.fpp'
 
 module m_phase_change
@@ -91,15 +91,18 @@ contains
 
         ! $:GPU_DECLARE(create='[pS,pSOV,pSSL,TS,TSOV,TSSL,TSatOV,TSatSL]')
         ! $:GPU_DECLARE(create='[rhoe,dynE,rhos,rho,rM,m1,m2,MCT,TvF]')
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
+#:else 
         real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
+#:endif
         ! $:GPU_DECLARE(create='[p_infOV,p_infpT,p_infSL,sk,hk,gk,ek,rhok]')
 
         !< Generic loop iterators
         integer :: i, j, k, l
 
         ! starting equilibrium solver
-        $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok,pS, pSOV, pSSL, TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, dynE, rhos, rho, rM, m1, m2, MCT, TvF]')
+        $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok,pS, pSOV, pSSL, TS, TSOV, TSatOV, TSatSL, TSSL, rhoe, dynE, rhos, rho, rM, m1, m2, MCT, TvF]')
         do j = 0, m
             do k = 0, n
                 do l = 0, p
@@ -231,25 +234,28 @@ contains
 
                     ! Calculations AFTER equilibrium
 
-                    ! entropy
-                    sk(1:num_fluids) = cvs(1:num_fluids)*log((TS**gs_min(1:num_fluids)) &
-                                                             /((pS + ps_inf(1:num_fluids))**(gs_min(1:num_fluids) - 1.0_wp))) + qvps(1:num_fluids)
+                    $:GPU_LOOP(parallelism='[seq]')
+                    do i = 1, num_fluids
+                        ! entropy
+                        sk(i) = cvs(i)*log((TS**gs_min(i)) &
+                                                                 /((pS + ps_inf(i))**(gs_min(i) - 1.0_wp))) + qvps(i)
 
-                    ! enthalpy
-                    hk(1:num_fluids) = gs_min(1:num_fluids)*cvs(1:num_fluids)*TS &
-                                       + qvs(1:num_fluids)
+                        ! enthalpy
+                        hk(i) = gs_min(i)*cvs(i)*TS &
+                                           + qvs(i)
 
-                    ! Gibbs-free energy
-                    gk(1:num_fluids) = hk(1:num_fluids) - TS*sk(1:num_fluids)
+                        ! Gibbs-free energy
+                        gk(i) = hk(i) - TS*sk(i)
 
-                    ! densities
-                    rhok(1:num_fluids) = (pS + ps_inf(1:num_fluids)) &
-                                         /((gs_min(1:num_fluids) - 1)*cvs(1:num_fluids)*TS)
+                        ! densities
+                        rhok(i) = (pS + ps_inf(i)) &
+                                             /((gs_min(i) - 1)*cvs(i)*TS)
 
-                    ! internal energy
-                    ek(1:num_fluids) = (pS + gs_min(1:num_fluids) &
-                                        *ps_inf(1:num_fluids))/(pS + ps_inf(1:num_fluids)) &
-                                       *cvs(1:num_fluids)*TS + qvs(1:num_fluids)
+                        ! internal energy
+                        ek(i) = (pS + gs_min(i) &
+                                            *ps_inf(i))/(pS + ps_inf(i)) &
+                                           *cvs(i)*TS + qvs(i)
+                    end do
 
                     ! calculating volume fractions, internal energies, and total entropy
                     rhos = 0.0_wp
@@ -293,7 +299,11 @@ contains
         ! initializing variables
         integer, intent(in) :: j, k, l, MFL
         real(wp), intent(out) :: pS
-        real(wp), dimension(num_fluids), intent(out) :: p_infpT
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3), intent(out) :: p_infpT
+#:else
+        real(wp), dimension(num_fluids), intent(out) :: p_infpT 
+#:endif
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         real(wp), intent(in) :: rhoe
         real(wp), intent(out) :: TS
@@ -302,7 +312,11 @@ contains
         integer :: i, ns !< generic loop iterators
 
         ! auxiliary variables for the pT-equilibrium solver
-        mCP = 0.0_wp; mQ = 0.0_wp; p_infpT = ps_inf; 
+        mCP = 0.0_wp; mQ = 0.0_wp; 
+        $:GPU_LOOP(parallelism='[seq]')
+        do i = 1, num_fluids 
+            p_infpT(i) = ps_inf(i)
+        end do
         ! Performing tests before initializing the pT-equilibrium
         $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids
@@ -315,6 +329,13 @@ contains
 
         end do
 
+        if(num_fluids < 3) then
+            $:GPU_LOOP(parallelism='[seq]')
+            do i = num_fluids+1, 3 
+                p_infpT(i) = 10**32_wp
+            end do
+        end if
+
         ! Checking energy constraint
         if ((rhoe - mQ - minval(p_infpT)) < 0.0_wp) then
 
@@ -392,12 +413,19 @@ contains
 
         integer, intent(in) :: j, k, l
         real(wp), intent(inout) :: pS
-        real(wp), dimension(num_fluids), intent(in) :: p_infpT
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
+        real(wp), dimension(3), intent(in) :: p_infpT
+#:else
+        real(wp), dimension(num_fluids), intent(in) :: p_infpT 
+#:endif
         real(wp), intent(in) :: rhoe
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         real(wp), intent(inout) :: TS
-
-        real(wp), dimension(num_fluids) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium
+#:else
+        real(wp), dimension(num_fluids) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium 
+#:endif
         real(wp), dimension(2, 2) :: Jac, InvJac, TJac !< matrices for the Newton Solver
         real(wp), dimension(2) :: R2D, DeltamP !< residual and correction array
         real(wp) :: Om ! underrelaxation factor
@@ -477,7 +505,8 @@ contains
             call s_compute_jacobian_matrix(InvJac, j, Jac, k, l, mCPD, mCVGP, mCVGP2, pS, q_cons_vf, TJac)
 
             ! calculating correction array for Newton's method
-            DeltamP = -1.0_wp*matmul(InvJac, R2D)
+            DeltamP(1) = -1.0_wp*(InvJac(1,1)*R2D(1) + InvJac(1,2)*R2D(2))
+            DeltamP(2) = -1.0_wp*(InvJac(2,1)*R2D(1) + InvJac(2,2)*R2D(2))
 
             ! updating two reacting 'masses'. Recall that inert 'masses' do not change during the phase change
             ! liquid
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index 144e7bce95..beb38bacf5 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -124,10 +124,11 @@ contains
 
         ! Chemistry
         real(wp), dimension(1:num_species), intent(in) :: rhoYks
+        real(wp), dimension(1:num_species) :: Y_rs
         real(wp) :: E_e
         real(wp) :: e_Per_Kg, Pdyn_Per_Kg
         real(wp) :: T_guess
-        real(wp), dimension(1:num_species) :: Y_rs
+        
 
         integer :: s !< Generic loop iterator
 
@@ -251,9 +252,10 @@ contains
         real(wp), intent(out), target :: qv
 
         real(wp), optional, dimension(2), intent(out) :: Re_K
-        real(wp), optional, intent(out) :: G_K
+        real(wp), optional, intent(out) :: G_K 
+        real(wp), dimension(num_fluids), intent(inout) :: alpha_rho_K, alpha_K !<
         real(wp), optional, dimension(num_fluids), intent(in) :: G
-        real(wp), dimension(num_fluids) :: alpha_rho_K, alpha_K !<
+        
 
         integer :: i, j !< Generic loop iterator
 
@@ -321,10 +323,17 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3), intent(inout) :: alpha_rho_K, alpha_K !<
+        real(wp), optional, dimension(3), intent(in) :: G
+#:else 
         real(wp), dimension(num_fluids), intent(inout) :: alpha_rho_K, alpha_K !<
+        real(wp), optional, dimension(num_fluids), intent(in) :: G
+#:endif
         real(wp), dimension(2), intent(out) :: Re_K
         real(wp), optional, intent(out) :: G_K
-        real(wp), optional, dimension(num_fluids), intent(in) :: G
+        real(wp) :: alpha_K_sum
+        
 
         integer :: i, j !< Generic loop iterators
 
@@ -340,11 +349,13 @@ contains
             qv_K = qvs(1)
         else
             if (mpp_lim) then
+                alpha_K_sum = 0._wp
                 do i = 1, num_fluids
                     alpha_rho_K(i) = max(0._wp, alpha_rho_K(i))
                     alpha_K(i) = min(max(0._wp, alpha_K(i)), 1._wp)
+                    alpha_K_sum = alpha_K_sum + alpha_K(i)
                 end do
-                alpha_K = alpha_K/max(sum(alpha_K), sgm_eps)
+                alpha_K = alpha_K/max(alpha_K_sum, sgm_eps)
             end if
             rho_K = 0._wp; gamma_K = 0._wp; pi_inf_K = 0._wp; qv_K = 0._wp
             do i = 1, num_fluids
@@ -579,13 +590,17 @@ contains
         type(scalar_field), intent(inout) :: q_T_sf
         type(scalar_field), dimension(sys_size), intent(inout) :: qK_prim_vf
         type(int_bounds_info), dimension(1:3), intent(in) :: ibounds
-
+#:if USING_AMD and not MFC_CASE_OPTIMIZATION
+        real(wp), dimension(3) :: alpha_K, alpha_rho_K
+        real(wp), dimension(3) :: nRtmp
+        real(wp) :: rhoYks(1:10)
+#:else
         real(wp), dimension(num_fluids) :: alpha_K, alpha_rho_K
+        real(wp), dimension(nb) :: nRtmp
+        real(wp) :: rhoYks(1:num_species) 
+#:endif
         real(wp), dimension(2) :: Re_K
         real(wp) :: rho_K, gamma_K, pi_inf_K, qv_K, dyn_pres_K
-        real(wp), dimension(nb) :: nRtmp
-
-        real(wp) :: rhoYks(1:num_species)
 
         real(wp) :: vftmp, nbub_sc
 
@@ -1166,19 +1181,27 @@ contains
                                                      is1, is2, is3, s2b, s3b)
 
         integer, intent(in) :: s2b, s3b
-        real(wp), dimension(0:, s2b:, s3b:, 1:), intent(in) :: qK_prim_vf
-        real(wp), dimension(0:, s2b:, s3b:, 1:), intent(inout) :: FK_vf
-        real(wp), dimension(0:, s2b:, s3b:, advxb:), intent(inout) :: FK_src_vf
+        real(wp), dimension(0:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(in) :: qK_prim_vf
+        real(wp), dimension(0:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:), intent(inout) :: FK_vf
+        real(wp), dimension(0:, idwbuff(2)%beg:, idwbuff(3)%beg:, advxb:), intent(inout) :: FK_src_vf
 
         type(int_bounds_info), intent(in) :: is1, is2, is3
 
         ! Partial densities, density, velocity, pressure, energy, advection
         ! variables, the specific heat ratio and liquid stiffness functions,
         ! the shear and volume Reynolds numbers and the Weber numbers
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: alpha_rho_K
+        real(wp), dimension(3) :: alpha_K
+        real(wp), dimension(3) :: vel_K
+        real(wp), dimension(10) :: Y_K
+#:else
         real(wp), dimension(num_fluids) :: alpha_rho_K
         real(wp), dimension(num_fluids) :: alpha_K
-        real(wp) :: rho_K
         real(wp), dimension(num_vels) :: vel_K
+        real(wp), dimension(num_species) :: Y_K 
+#:endif
+        real(wp) :: rho_K
         real(wp) :: vel_K_sum
         real(wp) :: pres_K
         real(wp) :: E_K
@@ -1187,7 +1210,6 @@ contains
         real(wp) :: qv_K
         real(wp), dimension(2) :: Re_K
         real(wp) :: G_K
-        real(wp), dimension(num_species) :: Y_K
         real(wp) :: T_K, mix_mol_weight, R_gas
 
         integer :: i, j, k, l !< Generic loop iterators
@@ -1315,8 +1337,13 @@ contains
             & parallelism='[seq]', cray_inline=True)
         type(scalar_field), dimension(sys_size), intent(in) :: q_vf
         integer, intent(in) :: k, l, r
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3), intent(out) :: alpha_rho_K, alpha_K
+#:else 
         real(wp), dimension(num_fluids), intent(out) :: alpha_rho_K, alpha_K
+#:endif
         integer :: i
+        real(wp) :: alpha_K_sum
 
         if (num_fluids == 1) then
             alpha_rho_K(1) = q_vf(contxb)%sf(k, l, r)
@@ -1342,11 +1369,13 @@ contains
         end if
 
         if (mpp_lim) then
+            alpha_K_sum = 0._wp
             do i = 1, num_fluids
                 alpha_rho_K(i) = max(0._wp, alpha_rho_K(i))
                 alpha_K(i) = min(max(0._wp, alpha_K(i)), 1._wp)
+                alpha_K_sum = alpha_K_sum + alpha_K(i)
             end do
-            alpha_K = alpha_K/max(sum(alpha_K), 1.e-16_wp)
+            alpha_K = alpha_K/max(alpha_K_sum, 1.e-16_wp)
         end if
 
         if (num_fluids == 1 .and. bubbles_euler) alpha_K(1) = q_vf(advxb)%sf(k, l, r)
@@ -1382,7 +1411,11 @@ contains
         real(wp), intent(in) :: pres
         real(wp), intent(in) :: rho, gamma, pi_inf, qv
         real(wp), intent(in) :: H
-        real(wp), dimension(num_fluids), intent(in) :: adv
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3), intent(in) :: adv
+#:else
+        real(wp), dimension(num_fluids), intent(in) :: adv 
+#:endif
         real(wp), intent(in) :: vel_sum
         real(wp), intent(in) :: c_c
         real(wp), intent(out) :: c
diff --git a/src/simulation/include/inline_riemann.fpp b/src/simulation/include/inline_riemann.fpp
index 3d1095324a..5b4598c0aa 100644
--- a/src/simulation/include/inline_riemann.fpp
+++ b/src/simulation/include/inline_riemann.fpp
@@ -40,16 +40,34 @@
         eps = 0.001_wp
         call get_species_enthalpies_rt(T_L, h_iL)
         call get_species_enthalpies_rt(T_R, h_iR)
-
+#:if USING_AMD
+        h_iL = h_iL*gas_constant/molecular_weights_nonparameter*T_L
+        h_iR = h_iR*gas_constant/molecular_weights_nonparameter*T_R
+#:else 
         h_iL = h_iL*gas_constant/molecular_weights*T_L
         h_iR = h_iR*gas_constant/molecular_weights*T_R
+#:endif
         call get_species_specific_heats_r(T_L, Cp_iL)
         call get_species_specific_heats_r(T_R, Cp_iR)
 
         h_avg_2 = (sqrt(rho_L)*h_iL + sqrt(rho_R)*h_iR)/(sqrt(rho_L) + sqrt(rho_R))
         Yi_avg = (sqrt(rho_L)*Ys_L + sqrt(rho_R)*Ys_R)/(sqrt(rho_L) + sqrt(rho_R))
         T_avg = (sqrt(rho_L)*T_L + sqrt(rho_R)*T_R)/(sqrt(rho_L) + sqrt(rho_R))
+#:if USING_AMD
+        if (abs(T_L - T_R) < eps) then
+            ! Case when T_L and T_R are very close
+            Cp_avg = sum(Yi_avg(:)*(0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights_nonparameter(:))
+            Cv_avg = sum(Yi_avg(:)*((0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights_nonparameter(:) - gas_constant/molecular_weights_nonparameter(:)))
+        else
+            ! Normal calculation when T_L and T_R are sufficiently different
+            Cp_avg = sum(Yi_avg(:)*(h_iR(:) - h_iL(:))/(T_R - T_L))
+            Cv_avg = sum(Yi_avg(:)*((h_iR(:) - h_iL(:))/(T_R - T_L) - gas_constant/molecular_weights_nonparameter(:)))
+        end if
+        gamma_avg = Cp_avg/Cv_avg
 
+        Phi_avg(:) = (gamma_avg - 1._wp)*(vel_avg_rms/2.0_wp - h_avg_2(:)) + gamma_avg*gas_constant/molecular_weights_nonparameter(:)*T_avg
+        c_sum_Yi_Phi = sum(Yi_avg(:)*Phi_avg(:))
+#:else 
         if (abs(T_L - T_R) < eps) then
             ! Case when T_L and T_R are very close
             Cp_avg = sum(Yi_avg(:)*(0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights(:))
@@ -59,11 +77,13 @@
             Cp_avg = sum(Yi_avg(:)*(h_iR(:) - h_iL(:))/(T_R - T_L))
             Cv_avg = sum(Yi_avg(:)*((h_iR(:) - h_iL(:))/(T_R - T_L) - gas_constant/molecular_weights(:)))
         end if
-
         gamma_avg = Cp_avg/Cv_avg
 
         Phi_avg(:) = (gamma_avg - 1._wp)*(vel_avg_rms/2.0_wp - h_avg_2(:)) + gamma_avg*gas_constant/molecular_weights(:)*T_avg
         c_sum_Yi_Phi = sum(Yi_avg(:)*Phi_avg(:))
+#:endif
+
+        
     end if
 
 #:enddef roe_avg
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 15864f1e2d..458f8dd514 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -143,8 +143,11 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
 
         integer, intent(in) :: t_step
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: myalpha, myalpha_rho
+#:else 
         real(wp), dimension(num_fluids) :: myalpha, myalpha_rho
+#:endif
         real(wp) :: myRho, B_tait
         real(wp) :: sim_time, c, small_gamma
         real(wp) :: frequency_local, gauss_sigma_time_local
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index ec45d58fe9..6fda2da3b1 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -165,9 +165,14 @@ contains
         real(wp) :: rddot
         real(wp) :: pb_local, mv_local, vflux, pbdot
         real(wp) :: n_tait, B_tait
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: Rtmp, Vtmp
+        real(wp), dimension(3) :: myalpha, myalpha_rho
+#:else 
         real(wp), dimension(nb) :: Rtmp, Vtmp
-        real(wp) :: myR, myV, alf, myP, myRho, R2Vav, R3
         real(wp), dimension(num_fluids) :: myalpha, myalpha_rho
+#:endif
+        real(wp) :: myR, myV, alf, myP, myRho, R2Vav, R3
         real(wp) :: nbub !< Bubble number density
         real(wp) :: my_divu
 
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 6f555fd59b..2f9937f140 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -537,7 +537,11 @@ contains
         real(wp) :: myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot
         real(wp) :: myPinf, aux1, aux2, myCson, myRho
         real(wp) :: gamma, pi_inf, qv
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: myalpha_rho, myalpha
+#:else 
         real(wp), dimension(contxe) :: myalpha_rho, myalpha
+#:endif
         real(wp), dimension(2) :: Re
         integer, dimension(3) :: cell
 
@@ -551,7 +555,7 @@ contains
         ! Subgrid p_inf model based on Maeda and Colonius (2018).
         if (lag_params%pressure_corrector) then
             ! Calculate velocity potentials (valid for one bubble per cell)
-            $:GPU_PARALLEL_LOOP(private='[k,cell]')
+            $:GPU_PARALLEL_LOOP(private='[k,cell,paux,preterm1,term2,Romega,myR0,myR,myV,myPb,pint,term1_fac]')
             do k = 1, nBubs
                 call s_get_pinf(k, q_prim_vf, 2, paux, cell, preterm1, term2, Romega)
                 myR0 = bub_R0(k)
@@ -573,7 +577,7 @@ contains
 
         ! Radial motion model
         adap_dt_stop_max = 0
-        $:GPU_PARALLEL_LOOP(private='[k,i,myalpha_rho,myalpha,Re,cell]', &
+        $:GPU_PARALLEL_LOOP(private='[k,i,myalpha_rho,myalpha,myVapFlux,preterm1, term2, paux, pint, Romega, term1_fac,myR_m, mygamma_m, myPb, myMass_n, myMass_v,myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot,myPinf, aux1, aux2, myCson, myRho,gamma,pi_inf,qv,dmalf, dmntait, dmBtait, dm_bub_adv_src, dm_divu,adap_dt_stop]', &
             & reduction='[[adap_dt_stop_max]]',reductionOp='[MAX]', &
             & copy='[adap_dt_stop_max]',copyin='[stage]')
         do k = 1, nBubs
@@ -778,7 +782,11 @@ contains
         real(wp), intent(out) :: cson
 
         real(wp) :: E, H
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: vel
+#:else 
         real(wp), dimension(num_dims) :: vel
+#:endif
         integer :: i
 
         $:GPU_LOOP(parallelism='[seq]')
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 0950072042..0f3f0011b7 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -644,25 +644,32 @@ contains
         integer, intent(in) :: cbc_dir_norm, cbc_loc_norm
 
         type(int_bounds_info), intent(in) :: ix, iy, iz
-
-        ! First-order time derivatives of the partial densities, density,
-        ! velocity, pressure, advection variables, and the specific heat
-        ! ratio and liquid stiffness functions
-
-        real(wp), dimension(num_fluids) :: dalpha_rho_dt
         real(wp) :: drho_dt
-        real(wp), dimension(num_dims) :: dvel_dt
         real(wp) :: dpres_dt
-        real(wp), dimension(num_fluids) :: dadv_dt
         real(wp) :: dgamma_dt
         real(wp) :: dpi_inf_dt
         real(wp) :: dqv_dt
         real(wp) :: dpres_ds
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: alpha_rho, dalpha_rho_ds, mf
+        real(wp), dimension(3) :: vel, dvel_ds
+        real(wp), dimension(3) :: adv_local, dadv_ds
+        real(wp), dimension(12) :: L
+        real(wp), dimension(3) :: dadv_dt
+        real(wp), dimension(3) :: dvel_dt
+        real(wp), dimension(3) :: dalpha_rho_dt
+        real(wp), dimension(10) :: Ys, h_k, dYs_dt, dYs_ds, Xs, Gamma_i, Cp_i
+#:else 
         real(wp), dimension(contxe) :: alpha_rho, dalpha_rho_ds, mf
-        real(wp), dimension(2) :: Re_cbc
         real(wp), dimension(num_vels) :: vel, dvel_ds
         real(wp), dimension(num_fluids) :: adv_local, dadv_ds
         real(wp), dimension(sys_size) :: L
+        real(wp), dimension(num_fluids) :: dadv_dt
+        real(wp), dimension(num_dims) :: dvel_dt
+        real(wp), dimension(num_fluids) :: dalpha_rho_dt
+        real(wp), dimension(num_species) :: Ys, h_k, dYs_dt, dYs_ds, Xs, Gamma_i, Cp_i
+#:endif
+        real(wp), dimension(2) :: Re_cbc
         real(wp), dimension(3) :: lambda
 
         real(wp) :: rho         !< Cell averaged density
@@ -676,7 +683,7 @@ contains
         real(wp) :: Ma
         real(wp) :: T, sum_Enthalpies
         real(wp) :: Cv, Cp, e_mix, Mw, R_gas
-        real(wp), dimension(num_species) :: Ys, h_k, dYs_dt, dYs_ds, Xs, Gamma_i, Cp_i
+        
 
         real(wp) :: vel_K_sum, vel_dv_dt_sum
 
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 594924e055..aabd7d726d 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -1,7 +1,7 @@
 !>
 !! @file m_compute_cbc.f90
 !! @brief CBC computation module
-
+#:include 'case.fpp'
 #:include 'macros.fpp'
 
 module m_compute_cbc
@@ -23,7 +23,11 @@ contains
         $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), intent(in) :: rho, c, dpres_ds
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3), intent(in) :: dvel_ds
+#:else
+        real(wp), dimension(num_dims), intent(in) :: dvel_ds 
+#:endif
         real(wp) :: L1
         L1 = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
     end function f_base_L1
@@ -31,9 +35,14 @@ contains
     !> Fill density L variables
     subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: lambda_factor, lambda2, c
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+#:endif
+        real(wp), intent(in) :: lambda_factor, lambda2, c
         real(wp), intent(in) :: dpres_ds
         integer :: i
 
@@ -46,9 +55,14 @@ contains
     !> Fill velocity L variables
     subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: dvel_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: lambda_factor, lambda2
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
+#:endif
+        real(wp), intent(in) :: lambda_factor, lambda2
         integer :: i
 
         ! $:GPU_LOOP(parallelism='[seq]')
@@ -60,9 +74,14 @@ contains
     !> Fill advection L variables
     subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: dadv_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: lambda_factor, lambda2
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+#:endif
+        real(wp), intent(in) :: lambda_factor, lambda2
         integer :: i
 
         ! $:GPU_LOOP(parallelism='[seq]')
@@ -74,9 +93,14 @@ contains
     !> Fill chemistry L variables
     subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(10), intent(in) :: dYs_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: lambda_factor, lambda2
         real(wp), dimension(num_species), intent(in) :: dYs_ds
+#:endif
+        real(wp), intent(in) :: lambda_factor, lambda2
         integer :: i
 
         if (.not. chemistry) return
@@ -93,9 +117,14 @@ contains
             & cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: dvel_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c, dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
+#:endif
+        real(wp), intent(in) :: rho, c, dpres_ds
         integer :: i
 
         L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
@@ -109,13 +138,22 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+        real(wp), dimension(3), intent(in) :: dvel_ds
+        real(wp), dimension(3), intent(in) :: dadv_ds
+        real(wp), dimension(10), intent(in) :: dYs_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), intent(in) :: dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
         real(wp), dimension(num_species), intent(in) :: dYs_ds
+#:endif
+        real(wp), intent(in) :: rho, c
+        real(wp), intent(in) :: dpres_ds
+
         real(wp) :: lambda_factor
 
         lambda_factor = (5.e-1_wp - 5.e-1_wp*sign(1._wp, lambda(1)))
@@ -137,9 +175,15 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: dvel_ds
+#:else
         real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), dimension(num_dims), intent(in) :: dvel_ds 
+#:endif
         real(wp), intent(in) :: rho, c, dpres_ds
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
+        
 
         L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
         L(2:advxe) = 0._wp
@@ -152,13 +196,21 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+        real(wp), dimension(3), intent(in) :: dvel_ds
+        real(wp), dimension(3), intent(in) :: dadv_ds
+        real(wp), dimension(10), intent(in) :: dYs_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), intent(in) :: dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
         real(wp), dimension(num_species), intent(in) :: dYs_ds
+#:endif
+        real(wp), intent(in) :: rho, c
+        real(wp), intent(in) :: dpres_ds
 
         L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
         call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
@@ -174,12 +226,19 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+        real(wp), dimension(3), intent(in) :: dvel_ds
+        real(wp), dimension(3), intent(in) :: dadv_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), intent(in) :: dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+#:endif
+        real(wp), intent(in) :: rho, c
+        real(wp), intent(in) :: dpres_ds
 
         L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
         call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
@@ -194,12 +253,20 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+        real(wp), dimension(3), intent(in) :: dvel_ds
+        real(wp), dimension(3), intent(in) :: dadv_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), intent(in) :: dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+#:endif
+        real(wp), intent(in) :: rho, c
+        real(wp), intent(in) :: dpres_ds
+
 
         L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
         call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
@@ -212,8 +279,11 @@ contains
     subroutine s_compute_supersonic_inflow_L(L)
         $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', &
             & parallelism='[seq]', cray_inline=True)
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
+#:endif
         L(1:advxe) = 0._wp
         if (chemistry) L(chemxb:chemxe) = 0._wp
     end subroutine s_compute_supersonic_inflow_L
@@ -224,13 +294,21 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(12), intent(inout) :: L
+        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+        real(wp), dimension(3), intent(in) :: dvel_ds
+        real(wp), dimension(3), intent(in) :: dadv_ds
+        real(wp), dimension(10), intent(in) :: dYs_ds
+#:else 
         real(wp), dimension(sys_size), intent(inout) :: L
-        real(wp), intent(in) :: rho, c
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), intent(in) :: dpres_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
         real(wp), dimension(num_species), intent(in) :: dYs_ds
+#:endif
+        real(wp), intent(in) :: rho, c
+        real(wp), intent(in) :: dpres_ds
 
         L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
         call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index d239532f43..97c1ce5a4d 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -269,7 +269,7 @@ contains
 
         real(wp) :: rho        !< Cell-avg. density
 #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(2) :: alpha      !< Cell-avg. volume fraction
+        real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
         real(wp), dimension(3) :: vel        !< Cell-avg. velocity
 #:else
         real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 76e0c74841..599ef3302f 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -98,9 +98,13 @@ contains
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(9) :: tensora, tensorb
+        real(wp), dimension(3) :: alpha_k, alpha_rho_k
+#:else 
         real(wp), dimension(tensor_size) :: tensora, tensorb
         real(wp), dimension(num_fluids) :: alpha_k, alpha_rho_k
+#:endif
         real(wp), dimension(2) :: Re
         real(wp) :: rho, gamma, pi_inf, qv
         real(wp) :: G_local
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 2c5d133060..1d4d712226 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -175,15 +175,23 @@ contains
         real(wp), dimension(2) :: Re_K
         real(wp) :: G_K
         real(wp) :: qv_K
-        real(wp), dimension(num_fluids) :: Gs
 
         real(wp) :: pres_IP
         real(wp), dimension(3) :: vel_IP, vel_norm_IP
         real(wp) :: c_IP
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
+        real(wp), dimension(3) :: Gs
+        real(wp), dimension(3) :: alpha_rho_IP, alpha_IP
+        real(wp), dimension(3) :: r_IP, v_IP, pb_IP, mv_IP
+        real(wp), dimension(18) :: nmom_IP
+        real(wp), dimension(12) :: presb_IP, massv_IP
+#:else 
+        real(wp), dimension(num_fluids) :: Gs
         real(wp), dimension(num_fluids) :: alpha_rho_IP, alpha_IP
         real(wp), dimension(nb) :: r_IP, v_IP, pb_IP, mv_IP
         real(wp), dimension(nb*nmom) :: nmom_IP
         real(wp), dimension(nb*nnode) :: presb_IP, massv_IP
+#:endif
         !! Primitive variables at the image point associated with a ghost point,
         !! interpolated from surrounding fluid cells.
 
@@ -854,7 +862,11 @@ contains
         real(wp), intent(INOUT) :: pres_IP
         real(wp), dimension(3), intent(INOUT) :: vel_IP
         real(wp), intent(INOUT) :: c_IP
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3), intent(INOUT) :: alpha_IP, alpha_rho_IP
+#:else 
         real(wp), dimension(num_fluids), intent(INOUT) :: alpha_IP, alpha_rho_IP
+#:endif
         real(wp), optional, dimension(:), intent(INOUT) :: r_IP, v_IP, pb_IP, mv_IP
         real(wp), optional, dimension(:), intent(INOUT) :: nmom_IP
         real(wp), optional, dimension(:), intent(INOUT) :: presb_IP, massv_IP
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index e99c9a6401..7e27a51de9 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -48,9 +48,6 @@ module m_igr
         integer :: vidxb, vidxe
         $:GPU_DECLARE(create='[vidxb, vidxe]')
 
-        integer :: offxL, offxR
-        $:GPU_DECLARE(create='[offxL, offxR]')
-
         real(wp), allocatable, dimension(:) :: coeff_L, coeff_R
         $:GPU_DECLARE(create='[coeff_L, coeff_R]')
     #:else
@@ -58,69 +55,65 @@ module m_igr
             integer, parameter :: vidxb = -2
             integer, parameter :: vidxe = 3
 
-            integer, parameter :: offxL = 2
-            integer, parameter :: offxR = 3
 #if defined(MFC_OpenMP)
-            real(wp) :: coeff_L(1:5) = [ &
-                        -3._wp/60._wp, &  ! Index -1
-                        27._wp/60._wp, &  ! Index 0
-                        47._wp/60._wp, &  ! Index 1
-                        -13._wp/60._wp, &  ! Index 2
-                        2._wp/60._wp &  ! Index 3
-                        ]
-
-            real(wp) :: coeff_R(1:5) = [ &
-                        2._wp/60._wp, &  ! Index -2
-                        -13._wp/60._wp, &  ! Index -1
-                        47._wp/60._wp, &  ! Index 0
-                        27._wp/60._wp, &  ! Index 1
-                        -3._wp/60._wp &  ! Index 2
-                        ]
+            real(wp) :: coeff_L(-1:3) = [ &
+                            -3._wp/60._wp, &  ! Index -1
+                            27._wp/60._wp, &  ! Index 0
+                            47._wp/60._wp, &  ! Index 1
+                            -13._wp/60._wp, &  ! Index 2
+                            2._wp/60._wp &  ! Index 3
+                            ]
+
+            real(wp) :: coeff_R(-2:2) = [ &
+                            2._wp/60._wp, &  ! Index -2
+                            -13._wp/60._wp, &  ! Index -1
+                            47._wp/60._wp, &  ! Index 0
+                            27._wp/60._wp, &  ! Index 1
+                            -3._wp/60._wp &  ! Index 2
+                            ]
 #else
-            real(wp), parameter :: coeff_L(1:5) = [ &
-                                   -3._wp/60._wp, &  ! Index -1
-                                   27._wp/60._wp, &  ! Index 0
-                                   47._wp/60._wp, &  ! Index 1
-                                   -13._wp/60._wp, &  ! Index 2
-                                   2._wp/60._wp &  ! Index 3
-                                   ]
-
-            real(wp), parameter :: coeff_R(1:5) = [ &
-                                   2._wp/60._wp, &  ! Index -2
-                                   -13._wp/60._wp, &  ! Index -1
-                                   47._wp/60._wp, &  ! Index 0
-                                   27._wp/60._wp, &  ! Index 1
-                                   -3._wp/60._wp &  ! Index 2
-                                   ]
+            real(wp), parameter :: coeff_L(-1:3) = [ &
+                                       -3._wp/60._wp, &  ! Index -1
+                                       27._wp/60._wp, &  ! Index 0
+                                       47._wp/60._wp, &  ! Index 1
+                                       -13._wp/60._wp, &  ! Index 2
+                                       2._wp/60._wp &  ! Index 3
+                                       ]
+
+            real(wp), parameter :: coeff_R(-2:2) = [ &
+                                       2._wp/60._wp, &  ! Index -2
+                                       -13._wp/60._wp, &  ! Index -1
+                                       47._wp/60._wp, &  ! Index 0
+                                       27._wp/60._wp, &  ! Index 1
+                                       -3._wp/60._wp &  ! Index 2
+                                       ]
 #endif
         #:elif igr_order == 3
             integer, parameter :: vidxb = -1
             integer, parameter :: vidxe = 2
 
-            integer, parameter :: offxL = 1
-            integer, parameter :: offxR = 2
 #if defined(MFC_OpenMP)
-            real(wp) :: coeff_L(1:3) = [ &
-                        2._wp/6._wp, & ! Index 0
-                        5._wp/6._wp, & ! Index 1
-                        -1._wp/6._wp & ! Index 2
-                        ]
-            real(wp) :: coeff_R(1:3) = [ &
-                        -1._wp/6._wp, & ! Index -1
-                        5._wp/6._wp, & ! Index 0
-                        2._wp/6._wp & ! Index 1
-                        ]
+            real(wp) :: coeff_L(0:2) = [ &
+                            2._wp/6._wp, & ! Index 0
+                            5._wp/6._wp, & ! Index 1
+                            -1._wp/6._wp & ! Index 2
+                            ]
+            real(wp) :: coeff_R(-1:1) = [ &
+                            -1._wp/6._wp, & ! Index -1
+                            5._wp/6._wp, & ! Index 0
+                            2._wp/6._wp & ! Index 1
+                            ]
 #else
-            real(wp), parameter :: coeff_L(1:3) = [ &
-                                   2._wp/6._wp, & ! Index 0
-                                   5._wp/6._wp, & ! Index 1
-                                   -1._wp/6._wp & ! Index 2
-                                   ]
-            real(wp), parameter :: coeff_R(1:3) = [ &
-                                   -1._wp/6._wp, & ! Index -1
-                                   5._wp/6._wp, & ! Index 0
-                                   2._wp/6._wp & ! Index 1
-                                   ]
+            real(wp), parameter :: coeff_L(0:2) = [ &
+                                       2._wp/6._wp, & ! Index 0
+                                       5._wp/6._wp, & ! Index 1
+                                       -1._wp/6._wp & ! Index 2
+                                       ]
+            real(wp), parameter :: coeff_R(-1:1) = [ &
+                                       -1._wp/6._wp, & ! Index -1
+                                       5._wp/6._wp, & ! Index 0
+                                       2._wp/6._wp & ! Index 1
+                                       ]
 #endif
         #:endif
 
@@ -224,36 +217,37 @@ contains
 
         #:if not MFC_CASE_OPTIMIZATION
             if (igr_order == 3) then
-                vidxb = -1; vidxe = 2; offxL = 1; offxR = 2
-                $:GPU_UPDATE(device='[vidxb, vidxe, offxL, offxR]')
+                vidxb = -1; vidxe = 2;
+                $:GPU_UPDATE(device='[vidxb, vidxe]')
+
+                 @:ALLOCATE(coeff_L(0:2))
+                coeff_L(0) = (2._wp/6._wp)
+                coeff_L(1) = (5._wp/6._wp)
+                coeff_L(2) = (-1._wp/6._wp)
 
-                @:ALLOCATE(coeff_L(1:3))
-                coeff_L(1) = (2._wp/6._wp)
-                coeff_L(2) = (5._wp/6._wp)
-                coeff_L(3) = (-1._wp/6._wp)
+                @:ALLOCATE(coeff_R(-1:1))
+                coeff_R(1) = (2._wp/6._wp)
+                coeff_R(0) = (5._wp/6._wp)
+                coeff_R(-1) = (-1._wp/6._wp)
 
-                @:ALLOCATE(coeff_R(1:3))
-                coeff_R(3) = (2._wp/6._wp)
-                coeff_R(2) = (5._wp/6._wp)
-                coeff_R(1) = (-1._wp/6._wp)
 
             elseif (igr_order == 5) then
-                vidxb = -2; vidxe = 3; offxL = 2; offxR = 3
-                $:GPU_UPDATE(device='[vidxb, vidxe, offxL, offxR]')
-
-                @:ALLOCATE(coeff_L(1:5))
-                coeff_L(1) = (-3._wp/60._wp)
-                coeff_L(2) = (27._wp/60._wp)
-                coeff_L(3) = (47._wp/60._wp)
-                coeff_L(4) = (-13._wp/60._wp)
-                coeff_L(5) = (2._wp/60._wp)
-
-                @:ALLOCATE(coeff_R(1:5))
-                coeff_R(5) = (-3._wp/60._wp)
-                coeff_R(4) = (27._wp/60._wp)
-                coeff_R(3) = (47._wp/60._wp)
-                coeff_R(2) = (-13._wp/60._wp)
-                coeff_R(1) = (2._wp/60._wp)
+                vidxb = -2; vidxe = 3;
+                $:GPU_UPDATE(device='[vidxb, vidxe]')
+
+                @:ALLOCATE(coeff_L(-1:3))
+                coeff_L(-1) = (-3._wp/60._wp)
+                coeff_L(0) = (27._wp/60._wp)
+                coeff_L(1) = (47._wp/60._wp)
+                coeff_L(2) = (-13._wp/60._wp)
+                coeff_L(3) = (2._wp/60._wp)
+
+                @:ALLOCATE(coeff_R(-2:2))
+                coeff_R(2) = (-3._wp/60._wp)
+                coeff_R(1) = (27._wp/60._wp)
+                coeff_R(0) = (47._wp/60._wp)
+                coeff_R(-1) = (-13._wp/60._wp)
+                coeff_R(-2) = (2._wp/60._wp)
             end if
 
             $:GPU_UPDATE(device='[coeff_L]')
@@ -408,22 +402,22 @@ contains
                     do q = vidxb + 1, vidxe
                         $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_fluids
-                            alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q + offxL)*q_cons_vf(i)%sf(j + q, k, l)
+                            alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l)
                         end do
 
-                        vel_L = vel_L + coeff_L(q + offxL)*q_cons_vf(momxb)%sf(j + q, k, l)
-                        F_L = F_L + coeff_L(q + offxL)*jac(j + q, k, l)
+                        vel_L = vel_L + coeff_L(q)*q_cons_vf(momxb)%sf(j + q, k, l)
+                        F_L = F_L + coeff_L(q)*jac(j + q, k, l)
                     end do
 
                     $:GPU_LOOP(parallelism='[seq]')
                     do q = vidxb, vidxe - 1
                         $:GPU_LOOP(parallelism='[seq]')
                         do i = 1, num_fluids
-                            alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q + offxR)*q_cons_vf(i)%sf(j + q, k, l)
+                            alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l)
                         end do
 
-                        vel_R = vel_R + coeff_R(q + offxR)*q_cons_vf(momxb)%sf(j + q, k, l)
-                        F_R = F_R + coeff_R(q + offxR)*jac(j + q, k, l)
+                        vel_R = vel_R + coeff_R(q)*q_cons_vf(momxb)%sf(j + q, k, l)
+                        F_R = F_R + coeff_R(q)*jac(j + q, k, l)
                     end do
 
                     $:GPU_LOOP(parallelism='[seq]')
@@ -531,12 +525,12 @@ contains
                                 end if
 
                                 if (q > vidxb) then
-                                    vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(2))
-                                    vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(4._wp*dvel_small(1))/3._wp
+                                    vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2))
+                                    vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(1))/3._wp
                                 end if
                                 if (q < vidxe) then
-                                    vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(2))
-                                    vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(4._wp*dvel_small(1))/3._wp
+                                    vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2))
+                                    vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(1))/3._wp
                                 end if
 
                                 !y-direction contributions
@@ -565,12 +559,12 @@ contains
                                 end if
 
                                 if (q > vidxb) then
-                                    vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(1))
-                                    vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(-2._wp*dvel_small(2))/3._wp
+                                    vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
+                                    vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp
                                 end if
                                 if (q < vidxe) then
-                                    vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(1))
-                                    vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(-2._wp*dvel_small(2))/3._wp
+                                    vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
+                                    vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp
                                 end if
 
                                 if (q == 0) then
@@ -597,13 +591,13 @@ contains
                             do q = vidxb + 1, vidxe
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q + offxL)*q_cons_vf(i)%sf(j + q, k, l)
+                                    alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l)
                                 end do
 
                                 if (num_fluids > 1) then
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
-                                        alpha_L(i) = alpha_L(i) + coeff_L(q + offxL)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
+                                        alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
                                     end do
                                 else
                                     alpha_L(1) = 1._wp
@@ -611,7 +605,7 @@ contains
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
-                                    vel_L(i) = vel_L(i) + coeff_L(q + offxL)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
+                                    vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
                                 end do
                             end do
 
@@ -619,13 +613,13 @@ contains
                             do q = vidxb, vidxe - 1
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q + offxR)*q_cons_vf(i)%sf(j + q, k, l)
+                                    alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l)
                                 end do
 
                                 if (num_fluids > 1) then
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
-                                        alpha_R(i) = alpha_R(i) + coeff_R(q + offxR)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
+                                        alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
                                     end do
                                 else
                                     alpha_R(1) = 1._wp
@@ -633,7 +627,7 @@ contains
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
-                                    vel_R(i) = vel_R(i) + coeff_R(q + offxR)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
+                                    vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
                                 end do
                             end do
 
@@ -739,12 +733,12 @@ contains
 
                             $:GPU_LOOP(parallelism='[seq]')
                             do q = vidxb + 1, vidxe
-                                E_L = E_L + coeff_L(q + offxL)*q_cons_vf(E_idx)%sf(j + q, k, l)
+                                E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j + q, k, l)
                             end do
 
                             $:GPU_LOOP(parallelism='[seq]')
                             do q = vidxb, vidxe - 1
-                                E_R = E_R + coeff_R(q + offxR)*q_cons_vf(E_idx)%sf(j + q, k, l)
+                                E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j + q, k, l)
                             end do
 
                             call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
@@ -952,14 +946,14 @@ contains
                                     end if
 
                                     if (q > vidxb) then
-                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(2))
-                                        vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q + offxL)*(dvel_small(3))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(4._wp*dvel_small(1))/3._wp
+                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2))
+                                        vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(3))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(1))/3._wp
                                     end if
                                     if (q < vidxe) then
-                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(2))
-                                        vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q + offxR)*(dvel_small(3))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(4._wp*dvel_small(1))/3._wp
+                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2))
+                                        vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(3))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(1))/3._wp
                                     end if
 
                                     !y-direction contributions
@@ -990,12 +984,12 @@ contains
                                     end if
 
                                     if (q > vidxb) then
-                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(1))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(-2._wp*dvel_small(2))/3._wp
+                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp
                                     end if
                                     if (q < vidxe) then
-                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(1))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(-2._wp*dvel_small(2))/3._wp
+                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp
                                     end if
 
                                     !z-direction contributions
@@ -1026,12 +1020,12 @@ contains
                                     end if
 
                                     if (q > vidxb) then
-                                        vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q + offxL)*(dvel_small(1))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(-2._wp*dvel_small(3))/3._wp
+                                        vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(1))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(3))/3._wp
                                     end if
                                     if (q < vidxe) then
-                                        vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q + offxR)*(dvel_small(1))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(-2._wp*dvel_small(3))/3._wp
+                                        vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(1))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(3))/3._wp
                                     end if
 
                                     if (q == 0) then
@@ -1061,13 +1055,13 @@ contains
                                 do q = vidxb + 1, vidxe
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
-                                        alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q + offxL)*q_cons_vf(i)%sf(j + q, k, l)
+                                        alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l)
                                     end do
 
                                     if (num_fluids > 1) then
                                         $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, num_fluids - 1
-                                            alpha_L(i) = alpha_L(i) + coeff_L(q + offxL)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
+                                            alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
                                         end do
                                     else
                                         alpha_L(1) = 1._wp
@@ -1075,7 +1069,7 @@ contains
 
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
-                                        vel_L(i) = vel_L(i) + coeff_L(q + offxL)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
+                                        vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
                                     end do
                                 end do
 
@@ -1083,13 +1077,13 @@ contains
                                 do q = vidxb, vidxe - 1
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
-                                        alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q + offxR)*q_cons_vf(i)%sf(j + q, k, l)
+                                        alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l)
                                     end do
 
                                     if (num_fluids > 1) then
                                         $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, num_fluids - 1
-                                            alpha_R(i) = alpha_R(i) + coeff_R(q + offxR)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
+                                            alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
                                         end do
                                     else
                                         alpha_R(1) = 1._wp
@@ -1097,7 +1091,7 @@ contains
 
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
-                                        vel_R(i) = vel_R(i) + coeff_R(q + offxR)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
+                                        vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
                                     end do
                                 end do
 
@@ -1232,12 +1226,12 @@ contains
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do q = vidxb + 1, vidxe
-                                    E_L = E_L + coeff_L(q + offxL)*q_cons_vf(E_idx)%sf(j + q, k, l)
+                                    E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j + q, k, l)
                                 end do
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do q = vidxb, vidxe - 1
-                                    E_R = E_R + coeff_R(q + offxR)*q_cons_vf(E_idx)%sf(j + q, k, l)
+                                    E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j + q, k, l)
                                 end do
 
                                 call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
@@ -1459,12 +1453,12 @@ contains
                                                     q_cons_vf(momxb + 1)%sf(j - 1, k + q, l)/rho_sf_small(-1))
 
                                     if (q > vidxb) then
-                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(2))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(-2._wp*dvel_small(1))/3._wp
+                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp
                                     end if
                                     if (q < vidxe) then
-                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(2))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(-2._wp*dvel_small(1))/3._wp
+                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp
                                     end if
 
                                     !y-direction contributions
@@ -1486,12 +1480,12 @@ contains
                                                     q_cons_vf(momxb + 1)%sf(j, k - 1 + q, l)/rho_sf_small(-1))
 
                                     if (q > vidxb) then
-                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(1))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(4._wp*dvel_small(2))/3._wp
+                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(2))/3._wp
                                     end if
                                     if (q < vidxe) then
-                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(1))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(4._wp*dvel_small(2))/3._wp
+                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(2))/3._wp
                                     end if
                                 end do
                             end if
@@ -1513,13 +1507,13 @@ contains
                             do q = vidxb + 1, vidxe
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q + offxL)*q_cons_vf(i)%sf(j, k + q, l)
+                                    alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k + q, l)
                                 end do
 
                                 if (num_fluids > 1) then
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
-                                        alpha_L(i) = alpha_L(i) + coeff_L(q + offxL)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
+                                        alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
                                     end do
                                 else
                                     alpha_L(1) = 1._wp
@@ -1527,7 +1521,7 @@ contains
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
-                                    vel_L(i) = vel_L(i) + coeff_L(q + offxL)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
+                                    vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
                                 end do
                             end do
 
@@ -1535,13 +1529,13 @@ contains
                             do q = vidxb, vidxe - 1
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q + offxR)*q_cons_vf(i)%sf(j, k + q, l)
+                                    alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k + q, l)
                                 end do
 
                                 if (num_fluids > 1) then
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
-                                        alpha_R(i) = alpha_R(i) + coeff_R(q + offxR)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
+                                        alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
                                     end do
                                 else
                                     alpha_R(1) = 1._wp
@@ -1549,7 +1543,7 @@ contains
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
-                                    vel_R(i) = vel_R(i) + coeff_R(q + offxR)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
+                                    vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
                                 end do
                             end do
 
@@ -1657,14 +1651,14 @@ contains
 
                             $:GPU_LOOP(parallelism='[seq]')
                             do q = vidxb + 1, vidxe
-                                E_L = E_L + coeff_L(q + offxL)*q_cons_vf(E_idx)%sf(j, k + q, l)
-                                F_L = F_L + coeff_L(q + offxL)*jac(j, k + q, l)
+                                E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k + q, l)
+                                F_L = F_L + coeff_L(q)*jac(j, k + q, l)
                             end do
 
                             $:GPU_LOOP(parallelism='[seq]')
                             do q = vidxb, vidxe - 1
-                                E_R = E_R + coeff_R(q + offxR)*q_cons_vf(E_idx)%sf(j, k + q, l)
-                                F_R = F_R + coeff_R(q + offxR)*jac(j, k + q, l)
+                                E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k + q, l)
+                                F_R = F_R + coeff_R(q)*jac(j, k + q, l)
                             end do
 
                             call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
@@ -1856,12 +1850,12 @@ contains
                                                         q_cons_vf(momxb + 1)%sf(j - 1, k + q, l)/rho_sf_small(-1))
 
                                         if (q > vidxb) then
-                                            vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(2))
-                                            vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(-2._wp*dvel_small(1))/3._wp
+                                            vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2))
+                                            vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp
                                         end if
                                         if (q < vidxe) then
-                                            vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(2))
-                                            vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(-2._wp*dvel_small(1))/3._wp
+                                            vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2))
+                                            vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp
                                         end if
 
                                         !y-direction contributions
@@ -1886,14 +1880,14 @@ contains
                                                         q_cons_vf(momxb + 2)%sf(j, k - 1 + q, l)/rho_sf_small(-1))
 
                                         if (q > vidxb) then
-                                            vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(1))
-                                            vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q + offxL)*(dvel_small(3))
-                                            vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(4._wp*dvel_small(2))/3._wp
+                                            vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
+                                            vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(3))
+                                            vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(2))/3._wp
                                         end if
                                         if (q < vidxe) then
-                                            vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(1))
-                                            vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q + offxR)*(dvel_small(3))
-                                            vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(4._wp*dvel_small(2))/3._wp
+                                            vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
+                                            vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(3))
+                                            vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(2))/3._wp
                                         end if
 
                                         !z-direction contributions
@@ -1914,12 +1908,12 @@ contains
                                                         q_cons_vf(momxb + 2)%sf(j, k + q, l + 1)/rho_sf_small(1) - &
                                                         q_cons_vf(momxb + 2)%sf(j, k + q, l - 1)/rho_sf_small(-1))
                                         if (q > vidxb) then
-                                            vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q + offxL)*(dvel_small(2))
-                                            vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(-2._wp*dvel_small(3))/3._wp
+                                            vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(2))
+                                            vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(3))/3._wp
                                         end if
                                         if (q < vidxe) then
-                                            vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q + offxR)*(dvel_small(2))
-                                            vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(-2._wp*dvel_small(3))/3._wp
+                                            vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(2))
+                                            vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(3))/3._wp
                                         end if
                                     end do
                                 end if
@@ -1941,13 +1935,13 @@ contains
                                 do q = vidxb + 1, vidxe
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
-                                        alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q + offxL)*q_cons_vf(i)%sf(j, k + q, l)
+                                        alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k + q, l)
                                     end do
 
                                     if (num_fluids > 1) then
                                         $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, num_fluids - 1
-                                            alpha_L(i) = alpha_L(i) + coeff_L(q + offxL)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
+                                            alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
                                         end do
                                     else
                                         alpha_L(1) = 1._wp
@@ -1955,7 +1949,7 @@ contains
 
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
-                                        vel_L(i) = vel_L(i) + coeff_L(q + offxL)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
+                                        vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
                                     end do
                                 end do
 
@@ -1963,13 +1957,13 @@ contains
                                 do q = vidxb, vidxe - 1
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids
-                                        alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q + offxR)*q_cons_vf(i)%sf(j, k + q, l)
+                                        alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k + q, l)
                                     end do
 
                                     if (num_fluids > 1) then
                                         $:GPU_LOOP(parallelism='[seq]')
                                         do i = 1, num_fluids - 1
-                                            alpha_R(i) = alpha_R(i) + coeff_R(q + offxR)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
+                                            alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
                                         end do
                                     else
                                         alpha_R(1) = 1._wp
@@ -1977,7 +1971,7 @@ contains
 
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
-                                        vel_R(i) = vel_R(i) + coeff_R(q + offxR)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
+                                        vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
                                     end do
                                 end do
 
@@ -2113,14 +2107,14 @@ contains
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do q = vidxb + 1, vidxe
-                                    E_L = E_L + coeff_L(q + offxL)*q_cons_vf(E_idx)%sf(j, k + q, l)
-                                    F_L = F_L + coeff_L(q + offxL)*jac(j, k + q, l)
+                                    E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k + q, l)
+                                    F_L = F_L + coeff_L(q)*jac(j, k + q, l)
                                 end do
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do q = vidxb, vidxe - 1
-                                    E_R = E_R + coeff_R(q + offxR)*q_cons_vf(E_idx)%sf(j, k + q, l)
-                                    F_R = F_R + coeff_R(q + offxR)*jac(j, k + q, l)
+                                    E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k + q, l)
+                                    F_R = F_R + coeff_R(q)*jac(j, k + q, l)
                                 end do
 
                                 call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
@@ -2341,12 +2335,12 @@ contains
                                                     q_cons_vf(momxb + 2)%sf(j - 1, k, l + q)/rho_sf_small(-1))
 
                                     if (q > vidxb) then
-                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(3))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(-2._wp*dvel_small(1))/3._wp
+                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(3))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp
                                     end if
                                     if (q < vidxe) then
-                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(3))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(-2._wp*dvel_small(1))/3._wp
+                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(3))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp
                                     end if
 
                                     !y-direction contributions
@@ -2368,12 +2362,12 @@ contains
                                                     q_cons_vf(momxb + 2)%sf(j, k - 1, l + q)/rho_sf_small(-1))
 
                                     if (q > vidxb) then
-                                        vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q + offxL)*(dvel_small(3))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(-2._wp*dvel_small(2))/3._wp
+                                        vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(3))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp
                                     end if
                                     if (q < vidxe) then
-                                        vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q + offxR)*(dvel_small(3))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(-2._wp*dvel_small(2))/3._wp
+                                        vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(3))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp
                                     end if
 
                                     !z-direction contributions
@@ -2396,14 +2390,14 @@ contains
                                                     q_cons_vf(momxb + 2)%sf(j, k, l + 1 + q)/rho_sf_small(1) - &
                                                     q_cons_vf(momxb + 2)%sf(j, k, l - 1 + q)/rho_sf_small(-1))
                                     if (q > vidxb) then
-                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q + offxL)*(dvel_small(1))
-                                        vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q + offxL)*(dvel_small(2))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q + offxL)*(4._wp*dvel_small(3))/3._wp
+                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
+                                        vflux_L_arr(2) = vflux_L_arr(2) + coeff_L(q)*(dvel_small(2))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(3))/3._wp
                                     end if
                                     if (q < vidxe) then
-                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q + offxR)*(dvel_small(1))
-                                        vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q + offxR)*(dvel_small(2))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q + offxR)*(4._wp*dvel_small(3))/3._wp
+                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
+                                        vflux_R_arr(2) = vflux_R_arr(2) + coeff_R(q)*(dvel_small(2))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(3))/3._wp
                                     end if
                                 end do
                             end if
@@ -2426,13 +2420,13 @@ contains
                             do q = vidxb + 1, vidxe
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q + offxL)*q_cons_vf(i)%sf(j, k, l + q)
+                                    alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k, l + q)
                                 end do
 
                                 if (num_fluids > 1) then
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
-                                        alpha_L(i) = alpha_L(i) + coeff_L(q + offxL)*q_cons_vf(E_idx + i)%sf(j, k, l + q)
+                                        alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k, l + q)
                                     end do
                                 else
                                     alpha_L(1) = 1._wp
@@ -2440,7 +2434,7 @@ contains
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
-                                    vel_L(i) = vel_L(i) + coeff_L(q + offxL)*q_cons_vf(momxb + i - 1)%sf(j, k, l + q)
+                                    vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k, l + q)
                                 end do
                             end do
 
@@ -2448,13 +2442,13 @@ contains
                             do q = vidxb, vidxe - 1
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q + offxR)*q_cons_vf(i)%sf(j, k, l + q)
+                                    alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k, l + q)
                                 end do
 
                                 if (num_fluids > 1) then
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
-                                        alpha_R(i) = alpha_R(i) + coeff_R(q + offxR)*q_cons_vf(E_idx + i)%sf(j, k, l + q)
+                                        alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k, l + q)
                                     end do
                                 else
                                     alpha_R(1) = 1._wp
@@ -2462,7 +2456,7 @@ contains
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
-                                    vel_R(i) = vel_R(i) + coeff_R(q + offxR)*q_cons_vf(momxb + i - 1)%sf(j, k, l + q)
+                                    vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k, l + q)
                                 end do
                             end do
 
@@ -2598,14 +2592,14 @@ contains
 
                             $:GPU_LOOP(parallelism='[seq]')
                             do q = vidxb + 1, vidxe
-                                E_L = E_L + coeff_L(q + offxL)*q_cons_vf(E_idx)%sf(j, k, l + q)
-                                F_L = F_L + coeff_L(q + offxL)*jac(j, k, l + q)
+                                E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k, l + q)
+                                F_L = F_L + coeff_L(q)*jac(j, k, l + q)
                             end do
 
                             $:GPU_LOOP(parallelism='[seq]')
                             do q = vidxb, vidxe - 1
-                                E_R = E_R + coeff_R(q + offxR)*q_cons_vf(E_idx)%sf(j, k, l + q)
-                                F_R = F_R + coeff_R(q + offxR)*jac(j, k, l + q)
+                                E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k, l + q)
+                                F_R = F_R + coeff_R(q)*jac(j, k, l + q)
                             end do
 
                             call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index c78fbe90ae..d87eab8898 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -146,7 +146,11 @@ contains
         integer, intent(in) :: j, k, l
 
         real(wp) :: pres_relax, f_pres, df_pres
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
+        real(wp), dimension(3) :: pres_K_init, rho_K_s
+#:else 
         real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
+#:endif
         integer, parameter :: MAX_ITER = 50
         real(wp), parameter :: TOLERANCE = 1.e-10_wp
         integer :: iter, i
@@ -213,8 +217,11 @@ contains
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer, intent(in) :: j, k, l
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
+        real(wp), dimension(2) :: alpha_rho, alpha
+#:else 
         real(wp), dimension(num_fluids) :: alpha_rho, alpha
+#:endif
         real(wp) :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha
         real(wp), dimension(2) :: Re
         integer :: i, q
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index f1f5453720..2f2dd46ed5 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -571,11 +571,15 @@ contains
             & cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeffs
+#:else
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
+#:endif
 
         integer :: i1, i2
 
-        coeffs = 0._wp
+        coeffs(:, :, :) = 0._wp
 
         do i2 = 0, 2; do i1 = 0, 2
                 if ((i1 + i2) <= 2) then
@@ -646,17 +650,21 @@ contains
             & cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeffs
+#:else
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
+#:endif
 
         integer :: i1, i2
 
-        coeffs = 0._wp
+        coeffs(:, :, :) = 0._wp
 
         do i2 = 0, 2; do i1 = 0, 2
                 if ((i1 + i2) <= 2) then
                     if (bubble_model == 3) then
                         ! RPE
-                        #:if not MFC_CASE_OPTIMIZATION or nterms > 7
+                        #:if not MFC_CASE_OPTIMIZATION or nterms > 1
                             coeffs(1, i1, i2) = -1._wp*i2*pres/rho
                             coeffs(2, i1, i2) = -3._wp*i2/2._wp
                             coeffs(3, i1, i2) = i2/rho
@@ -715,10 +723,15 @@ contains
         real(stp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: mv
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: rhs_mv
         type(int_bounds_info), intent(in) :: ix, iy, iz
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(6) :: moms, msum
+        real(wp), dimension(4, 3) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
+        real(wp), dimension(32, 0:2, 0:2) :: coeff
+#:else 
         real(wp), dimension(nmom) :: moms, msum
         real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
         real(wp), dimension(nterms, 0:2, 0:2) :: coeff
+#:endif
         real(wp) :: pres, rho, nbub, c, alf, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, grad_T
         real(wp) :: n_tait, B_tait
         integer :: id1, id2, id3, i1, i2, j, q, r
@@ -871,7 +884,11 @@ contains
             $:GPU_ROUTINE(function_name='s_coeff_selector',parallelism='[seq]', &
                 & cray_inline=True)
             real(wp), intent(in) :: pres, rho, c
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeff
+#:else
             real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
+#:endif
             logical, intent(in) :: polytropic
             if (polytropic) then
                 call s_coeff(pres, rho, c, coeff)
@@ -960,15 +977,24 @@ contains
 
         function f_quad(abscX, abscY, wght_in, q, r, s)
             $:GPU_ROUTINE(parallelism='[seq]')
-            real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(4, 3), intent(in) :: abscX, abscY, wght_in
+#:else
+            real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in 
+#:endif
             real(wp), intent(in) :: q, r, s
 
             real(wp) :: f_quad_RV, f_quad
-            integer :: i
+            integer :: i, i1
 
             f_quad = 0._wp
+            $:GPU_LOOP(parallelism='[seq]')
             do i = 1, nb
-                f_quad_RV = sum(wght_in(:, i)*(abscX(:, i)**q)*(abscY(:, i)**r))
+                f_quad_RV = 0._wp
+                $:GPU_LOOP(parallelism='[seq]')
+                do i1 = 1, nnode
+                    f_quad_RV = f_quad_RV + wght_in(i1, i)*(abscX(i1, i)**q)*(abscY(i1, i)**r)
+                end do
                 f_quad = f_quad + weight(i)*(R0(i)**s)*f_quad_RV
             end do
 
@@ -976,12 +1002,21 @@ contains
 
         function f_quad2D(abscX, abscY, wght_in, pow)
             $:GPU_ROUTINE(parallelism='[seq]')
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(4), intent(in) :: abscX, abscY, wght_in
+#:else 
             real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
+#:endif
             real(wp), dimension(3), intent(in) :: pow
 
             real(wp) :: f_quad2D
+            integer :: i
 
-            f_quad2D = sum(wght_in(:)*(abscX(:)**pow(1))*(abscY(:)**pow(2)))
+            f_quad2D = 0._wp
+            $:GPU_LOOP(parallelism='[seq]')
+            do i = 1, nnode
+                f_quad2D = f_quad2D + wght_in(i)*(abscX(i)**pow(1))*(abscY(i)**pow(2))
+            end do
         end function f_quad2D
 
     end subroutine s_mom_inv
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 1c8eda1c30..9689be4db0 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -339,7 +339,7 @@ contains
             ! END: Allocation/Association of flux_n, flux_src_n, and flux_gsrc_n
         end if
 
-        if (.not. igr) then
+        if ((.not. igr) .or. dummy) then
 
             ! Allocation of dq_prim_ds_qp
             @:ALLOCATE(dq_prim_dx_qp(1:1))
@@ -781,7 +781,7 @@ contains
                     call nvtxEndRange
                 end if
             end if 
-            if(.not. igr .or. dummy) then! Finite volume solve
+            if((.not. igr) .or. dummy) then! Finite volume solve
 
                 ! Reconstructing Primitive/Conservative Variables
                 call nvtxStartRange("RHS-WENO")
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index fc5cefadaf..d8b5792fe0 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -49,6 +49,10 @@ module m_riemann_solvers
         get_species_specific_heats_r, get_species_enthalpies_rt, &
         get_mixture_specific_heat_cp_mass
 
+    #:if USING_AMD
+        use m_chemistry, only: molecular_weights_nonparameter
+    #:endif
+
     implicit none
 
     private; public :: s_initialize_riemann_solvers_module, &
@@ -287,17 +291,25 @@ contains
 
         integer, intent(in) :: norm_dir
         type(int_bounds_info), intent(in) :: ix, iy, iz
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
+        real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+        real(wp), dimension(3) :: vel_L, vel_R
+        real(wp), dimension(3) :: alpha_L, alpha_R
+        real(wp), dimension(10) :: Ys_L, Ys_R
+        real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+        real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+#:else
         real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-        real(wp) :: rho_L, rho_R
         real(wp), dimension(num_vels) :: vel_L, vel_R
-        real(wp) :: pres_L, pres_R
-        real(wp) :: E_L, E_R
-        real(wp) :: H_L, H_R
         real(wp), dimension(num_fluids) :: alpha_L, alpha_R
         real(wp), dimension(num_species) :: Ys_L, Ys_R
         real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
         real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+#:endif
+        real(wp) :: rho_L, rho_R     
+        real(wp) :: pres_L, pres_R
+        real(wp) :: E_L, E_R
+        real(wp) :: H_L, H_R
         real(wp) :: Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi
         real(wp) :: T_L, T_R
         real(wp) :: Y_L, Y_R
@@ -481,9 +493,13 @@ contains
 
                                 call get_mixture_molecular_weight(Ys_L, MW_L)
                                 call get_mixture_molecular_weight(Ys_R, MW_R)
-
+#:if USING_AMD
+                                Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+#:else
                                 Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) 
+#:endif
 
                                 R_gas_L = gas_constant/MW_L
                                 R_gas_R = gas_constant/MW_R
@@ -1052,17 +1068,28 @@ contains
 
         integer, intent(in) :: norm_dir
         type(int_bounds_info), intent(in) :: ix, iy, iz
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+        real(wp), dimension(3) :: vel_L, vel_R
+        real(wp), dimension(3) :: alpha_L, alpha_R
+        real(wp), dimension(10) :: Ys_L, Ys_R
+        real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+        real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+         real(wp), dimension(3, 3) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+#:else 
         real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-        real(wp) :: rho_L, rho_R
         real(wp), dimension(num_vels) :: vel_L, vel_R
-        real(wp) :: pres_L, pres_R
-        real(wp) :: E_L, E_R
-        real(wp) :: H_L, H_R
         real(wp), dimension(num_fluids) :: alpha_L, alpha_R
         real(wp), dimension(num_species) :: Ys_L, Ys_R
         real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
         real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        real(wp), dimension(num_dims, num_dims) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+#:endif
+        real(wp) :: rho_L, rho_R
+       
+        real(wp) :: pres_L, pres_R
+        real(wp) :: E_L, E_R
+        real(wp) :: H_L, H_R
         real(wp) :: Cp_avg, Cv_avg, T_avg, eps, c_sum_Yi_Phi
         real(wp) :: T_L, T_R
         real(wp) :: Y_L, Y_R
@@ -1104,8 +1131,6 @@ contains
         type(riemann_states_vec3) :: cm ! Conservative momentum variables
 
         integer :: i, j, k, l, q !< Generic loop iterators
-
-        real(wp), dimension(num_dims, num_dims) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
         integer, dimension(3) :: idx_right_phys                     !< Physical (j,k,l) indices for right state.
 
         ! Populating the buffers of the left and right Riemann problem
@@ -1249,8 +1274,13 @@ contains
                                 call get_mixture_molecular_weight(Ys_L, MW_L)
                                 call get_mixture_molecular_weight(Ys_R, MW_R)
 
+#:if USING_AMD
+                                Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+#:else
                                 Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) 
+#:endif
 
                                 R_gas_L = gas_constant/MW_L
                                 R_gas_R = gas_constant/MW_R
@@ -1943,8 +1973,8 @@ contains
         type(int_bounds_info), intent(in) :: ix, iy, iz
 
 #:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(2) :: alpha_rho_L, alpha_rho_R
-        real(wp), dimension(2) :: alpha_L, alpha_R
+        real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+        real(wp), dimension(3) :: alpha_L, alpha_R
         real(wp), dimension(3) :: vel_L, vel_R
 #:else
         real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
@@ -1956,8 +1986,13 @@ contains
         real(wp) :: pres_L, pres_R
         real(wp) :: E_L, E_R
         real(wp) :: H_L, H_R
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
+        real(wp), dimension(10) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
+        real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+#:else
         real(wp), dimension(num_species) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
         real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+#:endif
         real(wp) :: Cp_avg, Cv_avg, T_avg, c_sum_Yi_Phi, eps
         real(wp) :: T_L, T_R
         real(wp) :: MW_L, MW_R
@@ -3227,8 +3262,13 @@ contains
                                     call get_mixture_molecular_weight(Ys_L, MW_L)
                                     call get_mixture_molecular_weight(Ys_R, MW_R)
 
+#:if USING_AMD
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+#:else
                                     Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) 
+#:endif
 
                                     R_gas_L = gas_constant/MW_L
                                     R_gas_R = gas_constant/MW_R
@@ -3680,7 +3720,11 @@ contains
         type(int_bounds_info), intent(in) :: ix, iy, iz
 
         ! Local variables:
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
+#:else 
         real(wp), dimension(num_fluids) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
+#:endif
         type(riemann_states_vec3) :: vel
         type(riemann_states) :: rho, pres, E, H_no_mag
         type(riemann_states) :: gamma, pi_inf, qv
@@ -4638,16 +4682,24 @@ contains
         type(int_bounds_info), intent(in) :: ix, iy, iz
 
         ! Local variables
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: avg_v_int       !!< Averaged interface velocity $(v_x, v_y, v_z)$ (grid directions).
+        real(wp), dimension(3) :: avg_dvdx_int    !!< Averaged interface $\partial v_i/\partial x$ (grid dir 1).
+        real(wp), dimension(3) :: avg_dvdy_int    !!< Averaged interface $\partial v_i/\partial y$ (grid dir 2).
+        real(wp), dimension(3) :: avg_dvdz_int    !!< Averaged interface $\partial v_i/\partial z$ (grid dir 3).
+        real(wp), dimension(3) :: vel_src_int !!< Interface velocity $(v_1,v_2,v_3)$ (grid directions) for viscous work.
+        real(wp), dimension(3) :: stress_vector_shear !!< Shear stress vector $(\sigma_{N1}, \sigma_{N2}, \sigma_{N3})$ on N-face (grid directions).
+#:else 
         real(wp), dimension(num_dims) :: avg_v_int       !!< Averaged interface velocity $(v_x, v_y, v_z)$ (grid directions).
         real(wp), dimension(num_dims) :: avg_dvdx_int    !!< Averaged interface $\partial v_i/\partial x$ (grid dir 1).
         real(wp), dimension(num_dims) :: avg_dvdy_int    !!< Averaged interface $\partial v_i/\partial y$ (grid dir 2).
         real(wp), dimension(num_dims) :: avg_dvdz_int    !!< Averaged interface $\partial v_i/\partial z$ (grid dir 3).
-
+        real(wp), dimension(num_dims) :: vel_src_int !!< Interface velocity $(v_1,v_2,v_3)$ (grid directions) for viscous work.
         real(wp), dimension(num_dims) :: stress_vector_shear !!< Shear stress vector $(\sigma_{N1}, \sigma_{N2}, \sigma_{N3})$ on N-face (grid directions).
+#:endif
         real(wp) :: stress_normal_bulk  !!< Normal bulk stress component $\sigma_{NN}$ on N-face.
 
         real(wp) :: Re_s, Re_b        !!< Effective interface shear and bulk Reynolds numbers.
-        real(wp), dimension(num_dims) :: vel_src_int !!< Interface velocity $(v_1,v_2,v_3)$ (grid directions) for viscous work.
         real(wp) :: r_eff             !!< Effective radius at interface for cylindrical terms.
         real(wp) :: div_v_term_const  !!< Common term $-(2/3)(\nabla \cdot \mathbf{v}) / \text{Re}_s$ for shear stress diagonal.
         real(wp) :: divergence_cyl    !!< Full divergence $\nabla \cdot \mathbf{v}$ in cylindrical coordinates.
@@ -4813,10 +4865,17 @@ contains
         integer, intent(in) :: norm_dir
 
         ! Local variables
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3, 3) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+        real(wp), dimension(3, 3) :: current_tau_shear   !< Current shear stress tensor.
+        real(wp), dimension(3, 3) :: current_tau_bulk    !< Current bulk stress tensor.
+        real(wp), dimension(3) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
+#:else 
         real(wp), dimension(num_dims, num_dims) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
         real(wp), dimension(num_dims, num_dims) :: current_tau_shear   !< Current shear stress tensor.
         real(wp), dimension(num_dims, num_dims) :: current_tau_bulk    !< Current bulk stress tensor.
         real(wp), dimension(num_dims) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
+#:endif
         integer, dimension(3) :: idx_right_phys                     !< Physical (j,k,l) indices for right state.
 
         real(wp) :: Re_shear !< Interface shear Reynolds number.
@@ -4929,10 +4988,15 @@ contains
         $:GPU_ROUTINE(parallelism='[seq]')
 
         ! Arguments
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3, 3), intent(in) :: vel_grad_avg
+        real(wp), dimension(3, 3), intent(out) :: tau_shear_out
+#:else 
         real(wp), dimension(num_dims, num_dims), intent(in) :: vel_grad_avg
+        real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
+#:endif
         real(wp), intent(in) :: Re_shear
         real(wp), intent(in) :: divergence_v
-        real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
 
         ! Local variables
         integer :: i_dim !< Loop iterator for face normal.
@@ -4963,7 +5027,11 @@ contains
         ! Arguments
         real(wp), intent(in) :: Re_bulk
         real(wp), intent(in) :: divergence_v
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
+#:else 
         real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out
+#:endif
 
         ! Local variables
         integer :: i_dim !< Loop iterator for diagonal components.
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 873a614cbb..64f9fd427d 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -98,7 +98,7 @@ contains
 
         type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
 #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), intent(inout), dimension(2) :: alpha
+        real(wp), intent(inout), dimension(3) :: alpha
         real(wp), intent(inout), dimension(3) :: vel
 #:else 
         real(wp), intent(inout), dimension(num_fluids) :: alpha
@@ -109,7 +109,7 @@ contains
         integer, intent(in) :: j, k, l
         real(wp), dimension(2), intent(inout) :: Re
 #:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(2) :: alpha_rho, Gs
+        real(wp), dimension(3) :: alpha_rho, Gs
 #:else
         real(wp), dimension(num_fluids) :: alpha_rho, Gs
 #:endif
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index d51b67f659..3afa1bd4b2 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -82,8 +82,11 @@ contains
             intent(inout) :: flux_src_vf
         integer, intent(in) :: id
         type(int_bounds_info), intent(in) :: isx, isy, isz
-
-        real(wp), dimension(num_dims, num_dims) :: Omega
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3, 3) :: Omega
+#:else
+        real(wp), dimension(num_dims, num_dims) :: Omega 
+#:endif
         real(wp) :: w1L, w1R, w2L, w2R, w3L, w3R, w1, w2, w3
         real(wp) :: normWL, normWR, normW
         integer :: j, k, l, i
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index cd4be44028..c34792f554 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -463,14 +463,18 @@ contains
 
         @:ALLOCATE(bc_type(1,1)%sf(0:0,0:n,0:p))
         @:ALLOCATE(bc_type(1,2)%sf(0:0,0:n,0:p))
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
         if (n > 0) then
             @:ALLOCATE(bc_type(2,1)%sf(-buff_size:m+buff_size,0:0,0:p))
             @:ALLOCATE(bc_type(2,2)%sf(-buff_size:m+buff_size,0:0,0:p))
+            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
             if (p > 0) then
                 @:ALLOCATE(bc_type(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
                 @:ALLOCATE(bc_type(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
             end if
+            #:endif
         end if
+        #:endif
 
         do i = 1, num_dims
             do j = 1, 2
@@ -718,10 +722,15 @@ contains
     impure subroutine s_compute_dt()
 
         real(wp) :: rho        !< Cell-avg. density
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: vel        !< Cell-avg. velocity
+        real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
+#:else 
         real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
-        real(wp) :: vel_sum    !< Cell-avg. velocity sum
-        real(wp) :: pres       !< Cell-avg. pressure
         real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
+#:endif
+        real(wp) :: vel_sum    !< Cell-avg. velocity sum
+        real(wp) :: pres       !< Cell-avg. pressure   
         real(wp) :: gamma      !< Cell-avg. sp. heat ratio
         real(wp) :: pi_inf     !< Cell-avg. liquid stiffness function
         real(wp) :: qv         !< Cell-avg. fluid reference energy
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index f3fc59ee43..3cd6099539 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -72,9 +72,13 @@ contains
 
         real(wp) :: rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum  !< Mixture variables
         real(wp), dimension(2) :: Re_visc
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: alpha_visc, alpha_rho_visc
+        real(wp), dimension(3, 3) :: tau_Re
+#:else 
         real(wp), dimension(num_fluids) :: alpha_visc, alpha_rho_visc
-
         real(wp), dimension(num_dims, num_dims) :: tau_Re
+#:endif
 
         integer :: i, j, k, l, q !< Generic loop iterator
 
@@ -97,7 +101,7 @@ contains
 
         #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
             if (shear_stress) then    ! Shear stresses
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum ,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = -1, 1
                         do j = is1_viscous%beg, is1_viscous%end
@@ -207,7 +211,7 @@ contains
 
         #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
             if (bulk_stress) then    ! Bulk stresses
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[i,j,k,l,rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum ,alpha_visc, alpha_rho_visc, Re_visc, tau_Re]')
                 do l = is3_viscous%beg, is3_viscous%end
                     do k = -1, 1
                         do j = is1_viscous%beg, is1_viscous%end
@@ -1158,56 +1162,54 @@ contains
                                        recon_dir, &
                                        is1_viscous, is2_viscous, is3_viscous)
                 end if
+            end if
+        #:endfor
 
-                if (viscous .or. dummy) then
-                    #:if not MFC_CASE_OPTIMIZATION or viscous
-                    if (weno_Re_flux) then
-                        if (norm_dir == 2) then
-                            $:GPU_PARALLEL_LOOP(collapse=4)
-                            do i = iv%beg, iv%end
-                                do l = is3_viscous%beg, is3_viscous%end
-                                    do j = is1_viscous%beg, is1_viscous%end
-                                        do k = is2_viscous%beg, is2_viscous%end
-                                            vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i)
-                                            vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i)
-                                        end do
-                                    end do
+        if (viscous .or. dummy) then
+            if (weno_Re_flux) then
+                if (norm_dir == 2) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do l = is3_viscous%beg, is3_viscous%end
+                            do j = is1_viscous%beg, is1_viscous%end
+                                do k = is2_viscous%beg, is2_viscous%end
+                                    vL_prim_vf(i)%sf(k, j, l) = vL_y(j, k, l, i)
+                                    vR_prim_vf(i)%sf(k, j, l) = vR_y(j, k, l, i)
                                 end do
                             end do
-                            $:END_GPU_PARALLEL_LOOP()
-                        elseif (norm_dir == 3) then
-                            $:GPU_PARALLEL_LOOP(collapse=4)
-                            do i = iv%beg, iv%end
-                                do j = is1_viscous%beg, is1_viscous%end
-                                    do k = is2_viscous%beg, is2_viscous%end
-                                        do l = is3_viscous%beg, is3_viscous%end
-                                            vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i)
-                                            vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i)
-                                        end do
-                                    end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                elseif (norm_dir == 3) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do j = is1_viscous%beg, is1_viscous%end
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do l = is3_viscous%beg, is3_viscous%end
+                                    vL_prim_vf(i)%sf(l, k, j) = vL_z(j, k, l, i)
+                                    vR_prim_vf(i)%sf(l, k, j) = vR_z(j, k, l, i)
                                 end do
                             end do
-                            $:END_GPU_PARALLEL_LOOP()
-                        elseif (norm_dir == 1) then
-                            $:GPU_PARALLEL_LOOP(collapse=4)
-                            do i = iv%beg, iv%end
-                                do l = is3_viscous%beg, is3_viscous%end
-                                    do k = is2_viscous%beg, is2_viscous%end
-                                        do j = is1_viscous%beg, is1_viscous%end
-                                            vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i)
-                                            vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i)
-                                        end do
-                                    end do
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
+                elseif (norm_dir == 1) then
+                    $:GPU_PARALLEL_LOOP(collapse=4)
+                    do i = iv%beg, iv%end
+                        do l = is3_viscous%beg, is3_viscous%end
+                            do k = is2_viscous%beg, is2_viscous%end
+                                do j = is1_viscous%beg, is1_viscous%end
+                                    vL_prim_vf(i)%sf(j, k, l) = vL_x(j, k, l, i)
+                                    vR_prim_vf(i)%sf(j, k, l) = vR_x(j, k, l, i)
                                 end do
                             end do
-                            $:END_GPU_PARALLEL_LOOP()
-                        end if
-                    end if
-                    #:endif
+                        end do
+                    end do
+                    $:END_GPU_PARALLEL_LOOP()
                 end if
             end if
+        end if
 
-        #:endfor
     end subroutine s_reconstruct_cell_boundary_values_visc_deriv
 
     !>  The purpose of this subroutine is to employ the inputted
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index 6e8b668326..ff6a0e5f93 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -665,7 +665,7 @@ contains
         real(wp), dimension(-3:3) :: v ! temporary field value array for clarity (WENO7 only)
         real(wp) :: tau
 
-        integer :: i, j, k, l
+        integer :: i, j, k, l, q
 
         is1_weno = is1_weno_d
         is2_weno = is2_weno_d
@@ -808,7 +808,7 @@ contains
             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 1
                 #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                     if (weno_dir == ${WENO_DIR}$) then
-                        $:GPU_PARALLEL_LOOP(collapse=3,private='[dvd,poly,beta,alpha,omega,tau,delta]')
+                        $:GPU_PARALLEL_LOOP(collapse=3,private='[dvd,poly,beta,alpha,omega,tau,delta,q]')
                         do l = is3_weno%beg, is3_weno%end
                             do k = is2_weno%beg, is2_weno%end
                                 do j = is1_weno%beg, is1_weno%end
@@ -816,6 +816,11 @@ contains
                                     do i = 1, v_size
                                         ! reconstruct from left side
 
+                                        alpha(:) = 0._wp 
+                                        omega(:) = 0._wp 
+                                        delta(:) = 0._wp
+                                        beta(:) = weno_eps
+
                                         dvd(1) = v_rs_ws_${XYZ}$ (j + 2, k, l, i) &
                                                  - v_rs_ws_${XYZ}$ (j + 1, k, l, i)
                                         dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) &
@@ -862,23 +867,36 @@ contains
                                             ! Borges, et al. (2008)
 
                                             tau = abs(beta(2) - beta(0))                   ! Equation 25
-                                            alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + tau/beta(0:weno_num_stencils))  ! Equation 28 (note: weno_eps was already added to beta)
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do q = 0, weno_num_stencils
+                                                alpha(q) = d_cbL_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))) ! Equation 28 (note: weno_eps was already added to beta)
+                                            end do 
 
                                         elseif (teno) then
                                              ! Fu, et al. (2016)
                                             ! Fu''s code: https://dx.doi.org/10.13140/RG.2.2.36250.34247
                                             tau = abs(beta(2) - beta(0))
-                                            alpha = 1._wp + tau/beta                    ! Equation 22 (reuse alpha as gamma; pick C=1 & q=6)
-                                            alpha = (alpha*alpha*alpha)**2._wp          ! Equation 22 cont. (some CPU compilers cannot optimize x**6.0)
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do q = 0, weno_num_stencils
+                                                alpha(q) = 1._wp + tau/beta(q)                    ! Equation 22 (reuse alpha as gamma; pick C=1 & q=6)
+                                                alpha(q) = (alpha(q)**3._wp)**2._wp          ! Equation 22 cont. (some CPU compilers cannot optimize x**6.0)
+                                            end do
                                             omega = alpha/sum(alpha)                    ! Equation 25 (reuse omega as xi)
-                                            delta = merge(0._wp, 1._wp, omega < teno_CT)! Equation 26
-                                            alpha(0:weno_num_stencils) = delta(0:weno_num_stencils)*d_cbL_${XYZ}$ (0:weno_num_stencils, j)    ! Equation 27
-
+                                            
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do q = 0, weno_num_stencils
+                                                if(omega(q) < teno_CT) then             ! Equation 26
+                                                    delta(q) = 0._wp 
+                                                else 
+                                                    delta(q) = 1._wp 
+                                                end if 
+                                                alpha(q) = delta(q)*d_cbL_${XYZ}$ (q, j) ! Equation 27
+                                            end do
                                         end if
 
                                         omega = alpha/sum(alpha)
 
-                                        vL_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly)
+                                        vL_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega(2)*poly(2)
 
                                         ! reconstruct from right side
 
@@ -904,16 +922,21 @@ contains
 
                                         elseif (wenoz) then
 
-                                            alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + tau/beta(0:weno_num_stencils))
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do q = 0, weno_num_stencils
+                                                alpha(q) = d_cbR_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))) 
+                                            end do 
 
                                         elseif (teno) then
-                                            alpha(0:weno_num_stencils) = delta(0:weno_num_stencils)*d_cbR_${XYZ}$ (0:weno_num_stencils, j)
-
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do q = 0, weno_num_stencils
+                                                alpha(q) = delta(q)*d_cbR_${XYZ}$ (q, j) 
+                                            end do
                                         end if
 
                                         omega = alpha/sum(alpha)
 
-                                        vR_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly)
+                                        vR_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega(2)*poly(2)
 
                                     end do
                                 end do
@@ -933,13 +956,18 @@ contains
             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 2
                 #:for WENO_DIR, XYZ in [(1, 'x'), (2, 'y'), (3, 'z')]
                     if (weno_dir == ${WENO_DIR}$) then
-                        $:GPU_PARALLEL_LOOP(collapse=3,private='[poly,beta,alpha,omega,tau,delta,dvd,v]')
+                        $:GPU_PARALLEL_LOOP(collapse=3,private='[poly,beta,alpha,omega,tau,delta,dvd,v,q]')
                         do l = is3_weno%beg, is3_weno%end
                             do k = is2_weno%beg, is2_weno%end
                                 do j = is1_weno%beg, is1_weno%end
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, v_size
 
+                                        alpha(:) = 0._wp 
+                                        omega(:) = 0._wp 
+                                        delta(:) = 0._wp
+                                        beta(:) = weno_eps
+
                                         if (teno) v = v_rs_ws_${XYZ}$ (j - 3:j + 3, k, l, i) ! temporary field value array for clarity
 
                                         if (.not. teno) then
@@ -1056,7 +1084,10 @@ contains
                                             ! Castro, et al. (2010)
                                             ! Don & Borges (2013) also helps
                                             tau = abs(beta(3) - beta(0)) ! Equation 50
-                                            alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + (tau/beta(0:weno_num_stencils))**wenoz_q) ! q = 2,3,4 for stability
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do q = 0, weno_num_stencils
+                                                alpha(q) = d_cbL_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))**wenoz_q) ! wenoz_q = 2,3,4 for stability
+                                            end do 
 
                                         elseif (teno) then
                                             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3
@@ -1064,14 +1095,28 @@ contains
                                                 alpha = 1._wp + tau/beta
                                                 alpha = (alpha**3._wp)**2._wp ! some CPU compilers cannot optimize x**6.0
                                                 omega = alpha/sum(alpha)
-                                                delta = merge(0._wp, 1._wp, omega < teno_CT)
-                                                alpha(0:weno_num_stencils) = delta(0:weno_num_stencils)*d_cbL_${XYZ}$ (0:weno_num_stencils, j)
+                                                
+                                                $:GPU_LOOP(parallelism='[seq]')
+                                                do q = 0, weno_num_stencils
+                                                    if(omega(q) < teno_CT) then             ! Equation 26
+                                                        delta(q) = 0._wp 
+                                                    else 
+                                                        delta(q) = 1._wp 
+                                                    end if 
+                                                    alpha(q) = delta(q)*d_cbL_${XYZ}$ (q, j) ! Equation 27
+                                                end do
                                             #:endif
                                         end if
 
                                         omega = alpha/sum(alpha)
 
-                                        vL_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly)
+                                        vL_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega(2)*poly(2) + omega(3)*poly(3)
+
+                                        if(teno) then 
+                                            #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3
+                                                vL_rs_vf_${XYZ}$ (j, k, l, i) = vL_rs_vf_${XYZ}$ (j, k, l, i) + omega(4)*poly(4)
+                                            #:endif
+                                        end if
 
                                         if (.not. teno) then
                                             poly(3) = v_rs_ws_${XYZ}$ (j, k, l, i) &
@@ -1111,16 +1156,27 @@ contains
 
                                         elseif (wenoz) then
 
-                                            alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + tau/beta(0:weno_num_stencils))
+                                           $:GPU_LOOP(parallelism='[seq]')
+                                            do q = 0, weno_num_stencils
+                                                alpha(q) = d_cbR_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))**wenoz_q) ! wenoz_q = 2,3,4 for stability
+                                            end do 
 
                                         elseif (teno) then
-                                            alpha(0:weno_num_stencils) = delta(0:weno_num_stencils)*d_cbR_${XYZ}$ (0:weno_num_stencils, j)
-
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do q = 0, weno_num_stencils
+                                                alpha(q) = delta(q)*d_cbR_${XYZ}$ (q, j) 
+                                            end do
                                         end if
 
                                         omega = alpha/sum(alpha)
 
-                                        vR_rs_vf_${XYZ}$ (j, k, l, i) = sum(omega*poly)
+                                        vR_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega(2)*poly(2) + omega(3)*poly(3)
+
+                                        if(teno) then 
+                                            #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3
+                                                vR_rs_vf_${XYZ}$ (j, k, l, i) = vR_rs_vf_${XYZ}$ (j, k, l, i) + omega(4)*poly(4)
+                                            #:endif
+                                        end if
 
                                     end do
                                 end do
diff --git a/toolchain/mfc/test/case.py b/toolchain/mfc/test/case.py
index 5a439aaead..c5ffdd301a 100644
--- a/toolchain/mfc/test/case.py
+++ b/toolchain/mfc/test/case.py
@@ -264,6 +264,8 @@ def compute_tolerance(self) -> float:
             tolerance = 1e-9
         elif self.params.get("mhd", 'F') == 'T':
             tolerance = 1e-8
+        elif "Axisymmetric" in self.trace.split(" -> "):
+            tolerance = 1e-11
 
         return 1e8 * tolerance if single else tolerance
 

From 1386ed981e315ce2150fcfa3575d5a930b59b6f3 Mon Sep 17 00:00:00 2001
From: Anand <anand@ipsec-10-2-68-64.vpn.gatech.edu>
Date: Sun, 25 Jan 2026 09:51:20 -0500
Subject: [PATCH 03/21] cleanup

---
 src/common/m_phase_change.fpp     |   6 +-
 src/simulation/m_bubbles_EL.fpp   |   2 +-
 src/simulation/m_cbc.fpp          |   9 ++-
 src/simulation/m_compute_cbc.fpp  | 103 ++++++++++++++++++++----------
 src/simulation/m_hyperelastic.fpp |   8 ++-
 src/simulation/m_qbmm.fpp         |  13 ++--
 src/simulation/m_weno.fpp         |   2 +-
 7 files changed, 94 insertions(+), 49 deletions(-)

diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 7a88a7b323..0ce4e24392 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -308,14 +308,16 @@ contains
         real(wp), intent(in) :: rhoe
         real(wp), intent(out) :: TS
         real(wp) :: gp, gpp, hp, pO, mCP, mQ !< variables for the Newton Solver
+        real(wp) :: p_infpT_sum
 
         integer :: i, ns !< generic loop iterators
 
         ! auxiliary variables for the pT-equilibrium solver
-        mCP = 0.0_wp; mQ = 0.0_wp; 
+        mCP = 0.0_wp; mQ = 0.0_wp; p_infpT_sum = 0._wp
         $:GPU_LOOP(parallelism='[seq]')
         do i = 1, num_fluids 
             p_infpT(i) = ps_inf(i)
+            p_infpT_sum = p_infpT_sum + abs(p_infpT(i))
         end do
         ! Performing tests before initializing the pT-equilibrium
         $:GPU_LOOP(parallelism='[seq]')
@@ -332,7 +334,7 @@ contains
         if(num_fluids < 3) then
             $:GPU_LOOP(parallelism='[seq]')
             do i = num_fluids+1, 3 
-                p_infpT(i) = 10**32_wp
+                p_infpT(i) = p_infpT_sum
             end do
         end if
 
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 2f9937f140..28e9c8c27b 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -540,7 +540,7 @@ contains
 #:if not MFC_CASE_OPTIMIZATION and USING_AMD
         real(wp), dimension(3) :: myalpha_rho, myalpha
 #:else 
-        real(wp), dimension(contxe) :: myalpha_rho, myalpha
+        real(wp), dimension(num_fluids) :: myalpha_rho, myalpha
 #:endif
         real(wp), dimension(2) :: Re
         integer, dimension(3) :: cell
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 0f3f0011b7..bd0abfbd20 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -650,20 +650,23 @@ contains
         real(wp) :: dpi_inf_dt
         real(wp) :: dqv_dt
         real(wp) :: dpres_ds
+#:if USING_AMD 
+        real(wp), dimension(12) :: L
+#:else 
+        real(wp), dimension(sys_size) :: L 
+#:endif
 #:if not MFC_CASE_OPTIMIZATION and USING_AMD
         real(wp), dimension(3) :: alpha_rho, dalpha_rho_ds, mf
         real(wp), dimension(3) :: vel, dvel_ds
         real(wp), dimension(3) :: adv_local, dadv_ds
-        real(wp), dimension(12) :: L
         real(wp), dimension(3) :: dadv_dt
         real(wp), dimension(3) :: dvel_dt
         real(wp), dimension(3) :: dalpha_rho_dt
         real(wp), dimension(10) :: Ys, h_k, dYs_dt, dYs_ds, Xs, Gamma_i, Cp_i
 #:else 
-        real(wp), dimension(contxe) :: alpha_rho, dalpha_rho_ds, mf
+        real(wp), dimension(num_fluids) :: alpha_rho, dalpha_rho_ds, mf
         real(wp), dimension(num_vels) :: vel, dvel_ds
         real(wp), dimension(num_fluids) :: adv_local, dadv_ds
-        real(wp), dimension(sys_size) :: L
         real(wp), dimension(num_fluids) :: dadv_dt
         real(wp), dimension(num_dims) :: dvel_dt
         real(wp), dimension(num_fluids) :: dalpha_rho_dt
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index aabd7d726d..aa0dd699f6 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -35,11 +35,14 @@ contains
     !> Fill density L variables
     subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
-        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
 #:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
+        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+#:else   
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
 #:endif
         real(wp), intent(in) :: lambda_factor, lambda2, c
@@ -55,11 +58,14 @@ contains
     !> Fill velocity L variables
     subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
-        real(wp), dimension(3), intent(in) :: dvel_ds
 #:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
+        real(wp), dimension(3), intent(in) :: dvel_ds
+#:else         
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
 #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
@@ -74,11 +80,14 @@ contains
     !> Fill advection L variables
     subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
-        real(wp), dimension(3), intent(in) :: dadv_ds
 #:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
+        real(wp), dimension(3), intent(in) :: dadv_ds
+#:else         
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
 #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
@@ -93,11 +102,14 @@ contains
     !> Fill chemistry L variables
     subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
-        real(wp), dimension(10), intent(in) :: dYs_ds
 #:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
+        real(wp), dimension(10), intent(in) :: dYs_ds
+#:else    
         real(wp), dimension(num_species), intent(in) :: dYs_ds
 #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
@@ -117,11 +129,14 @@ contains
             & cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
-        real(wp), dimension(3), intent(in) :: dvel_ds
 #:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
+        real(wp), dimension(3), intent(in) :: dvel_ds
+#:else        
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
 #:endif
         real(wp), intent(in) :: rho, c, dpres_ds
@@ -138,14 +153,17 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
+#:else 
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD      
         real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(3), intent(in) :: dvel_ds
         real(wp), dimension(3), intent(in) :: dadv_ds
         real(wp), dimension(10), intent(in) :: dYs_ds
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+#:else      
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
@@ -175,11 +193,14 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
+#:else 
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
         real(wp), dimension(3), intent(in) :: dvel_ds
-#:else
-        real(wp), dimension(sys_size), intent(inout) :: L
+#:else      
         real(wp), dimension(num_dims), intent(in) :: dvel_ds 
 #:endif
         real(wp), intent(in) :: rho, c, dpres_ds
@@ -196,14 +217,17 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
+#:else 
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
         real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(3), intent(in) :: dvel_ds
         real(wp), dimension(3), intent(in) :: dadv_ds
         real(wp), dimension(10), intent(in) :: dYs_ds
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+#:else        
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
@@ -226,13 +250,16 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
+#:else 
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
         real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(3), intent(in) :: dvel_ds
         real(wp), dimension(3), intent(in) :: dadv_ds
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+#:else         
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
@@ -253,13 +280,16 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
+#:else 
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
         real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(3), intent(in) :: dvel_ds
         real(wp), dimension(3), intent(in) :: dadv_ds
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+#:else        
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
@@ -279,10 +309,10 @@ contains
     subroutine s_compute_supersonic_inflow_L(L)
         $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', &
             & parallelism='[seq]', cray_inline=True)
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
 #:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+        real(wp), dimension(sys_size), intent(inout) :: L 
 #:endif
         L(1:advxe) = 0._wp
         if (chemistry) L(chemxb:chemxe) = 0._wp
@@ -294,14 +324,17 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD 
         real(wp), dimension(12), intent(inout) :: L
+#:else 
+        real(wp), dimension(sys_size), intent(inout) :: L 
+#:endif
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
         real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(3), intent(in) :: dvel_ds
         real(wp), dimension(3), intent(in) :: dadv_ds
         real(wp), dimension(10), intent(in) :: dYs_ds
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L
+#:else  
         real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
         real(wp), dimension(num_dims), intent(in) :: dvel_ds
         real(wp), dimension(num_fluids), intent(in) :: dadv_ds
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 599ef3302f..52914d9938 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -98,11 +98,15 @@ contains
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD
         real(wp), dimension(9) :: tensora, tensorb
-        real(wp), dimension(3) :: alpha_k, alpha_rho_k
 #:else 
         real(wp), dimension(tensor_size) :: tensora, tensorb
+#:endif
+
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: alpha_k, alpha_rho_k
+#:else 
         real(wp), dimension(num_fluids) :: alpha_k, alpha_rho_k
 #:endif
         real(wp), dimension(2) :: Re
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 2f2dd46ed5..53596338a2 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -571,7 +571,7 @@ contains
             & cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD
         real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeffs
 #:else
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
@@ -650,7 +650,7 @@ contains
             & cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD
         real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeffs
 #:else
         real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
@@ -726,11 +726,14 @@ contains
 #:if not MFC_CASE_OPTIMIZATION and USING_AMD
         real(wp), dimension(6) :: moms, msum
         real(wp), dimension(4, 3) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
-        real(wp), dimension(32, 0:2, 0:2) :: coeff
 #:else 
         real(wp), dimension(nmom) :: moms, msum
         real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
-        real(wp), dimension(nterms, 0:2, 0:2) :: coeff
+#:endif
+#:if USING_AMD
+        real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeff
+#:else
+        real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
 #:endif
         real(wp) :: pres, rho, nbub, c, alf, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, grad_T
         real(wp) :: n_tait, B_tait
@@ -884,7 +887,7 @@ contains
             $:GPU_ROUTINE(function_name='s_coeff_selector',parallelism='[seq]', &
                 & cray_inline=True)
             real(wp), intent(in) :: pres, rho, c
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+#:if USING_AMD
             real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeff
 #:else
             real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index ff6a0e5f93..ed1f5576ae 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -647,7 +647,7 @@ contains
         integer, intent(in) :: weno_dir
         type(int_bounds_info), intent(in) :: is1_weno_d, is2_weno_d, is3_weno_d
 
-#:if USING_AMD and not MFC_CASE_OPTIMIZATION
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
         real(wp), dimension(-3:2) :: dvd
         real(wp), dimension(0:4) :: poly
         real(wp), dimension(0:4) :: alpha

From f3ee9fa1114eb53ae034708e01d54030b8ea74c7 Mon Sep 17 00:00:00 2001
From: Anand Radhakrishnan <aradhakr34@login10.frontier.olcf.ornl.gov>
Date: Mon, 26 Jan 2026 21:27:39 -0500
Subject: [PATCH 04/21] 1D Dirichlet case fixed

---
 CMakeLists.txt                   | 10 ++++------
 src/common/m_boundary_common.fpp |  4 ++--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c63582a3e..77f398be7e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,11 +185,9 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
         add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
     endif()
 
-elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
+elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
     add_compile_options(
-        $<$<COMPILE_LANGUAGE:Fortran>:-Mfreeform>
-        $<$<COMPILE_LANGUAGE:Fortran>:-Mpreprocess>
-        $<$<COMPILE_LANGUAGE:Fortran>:-fdefault-real-8>
+	    #$<$<COMPILE_LANGUAGE:Fortran>:-Wall>
     )
 elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
     add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>)
@@ -529,8 +527,8 @@ function(MFC_SETUP_TARGET)
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-                    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a $ENV{CRAY_MPICH_INC})
-		    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a $ENV{CRAY_MPICH_LIB})
+		    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a $ENV{CRAY_MPICH_INC})
+		    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a  $ENV{CRAY_MPICH_LIB})
                 endif()
             endif()
 
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 333df31442..8f5412f149 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -44,7 +44,7 @@ module m_boundary_common
  s_populate_grid_variables_buffers, &
  s_finalize_boundary_common_module
 
-    !public :: bc_buffers
+    public :: bc_buffers
 
 #ifdef MFC_MPI
     public :: MPI_BC_TYPE_TYPE, MPI_BC_BUFFER_TYPE
@@ -56,7 +56,7 @@ contains
 
         integer :: i, j 
 
-        @:ALLOCATE(bc_buffers(1:num_dims, 1:2))
+        @:ALLOCATE(bc_buffers(1:3, 1:2))
 
         if (bc_io) then
             @:ALLOCATE(bc_buffers(1, 1)%sf(1:sys_size, 0:n, 0:p))

From fe48823526c176f4822261133a88ab226919e16e Mon Sep 17 00:00:00 2001
From: Anand Radhakrishnan <aradhakr34@login10.frontier.olcf.ornl.gov>
Date: Mon, 26 Jan 2026 22:29:05 -0500
Subject: [PATCH 05/21] cleanup

---
 src/common/m_phase_change.fpp         |   2 +
 src/common/m_variables_conversion.fpp |   2 +-
 src/simulation/m_fftw.fpp             | 462 +++++++++++++-------------
 src/simulation/m_qbmm.fpp             |   4 +-
 4 files changed, 236 insertions(+), 234 deletions(-)

diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index 0ce4e24392..a7742c0532 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -331,12 +331,14 @@ contains
 
         end do
 
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
         if(num_fluids < 3) then
             $:GPU_LOOP(parallelism='[seq]')
             do i = num_fluids+1, 3 
                 p_infpT(i) = p_infpT_sum
             end do
         end if
+#:endif
 
         ! Checking energy constraint
         if ((rhoe - mQ - minval(p_infpT)) < 0.0_wp) then
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index beb38bacf5..bc20350c91 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -253,7 +253,7 @@ contains
 
         real(wp), optional, dimension(2), intent(out) :: Re_K
         real(wp), optional, intent(out) :: G_K 
-        real(wp), dimension(num_fluids), intent(inout) :: alpha_rho_K, alpha_K !<
+        real(wp), dimension(num_fluids) :: alpha_rho_K, alpha_K !<
         real(wp), optional, dimension(num_fluids), intent(in) :: G
         
 
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 411485af10..55f8c8d123 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -70,55 +70,55 @@ contains
         !!      applying the Fourier filter in the azimuthal direction.
     impure subroutine s_initialize_fftw_module
 
-        integer :: ierr !< Generic flag used to identify and report GPU errors
-
-        ! Size of input array going into DFT
-        real_size = p + 1
-        ! Size of output array coming out of DFT
-        cmplx_size = (p + 1)/2 + 1
-
-        x_size = m + 1
-        batch_size = x_size*sys_size
-
-#if defined(MFC_GPU)
-        rank = 1; istride = 1; ostride = 1
-
-        allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))
-
-        gpu_fft_size(1) = real_size; 
-        iembed(1) = 0
-        oembed(1) = 0
-        $:GPU_ENTER_DATA(copyin='[real_size,cmplx_size,x_size,sys_size,batch_size,Nfq]')
-        $:GPU_UPDATE(device='[real_size,cmplx_size,x_size,sys_size,batch_size]')
-#else
-        ! Allocate input and output DFT data sizes
-        fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
-        fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-        fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-        ! Associate input and output data pointers with allocated memory
-        call c_f_pointer(fftw_real_data, data_real, [real_size])
-        call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
-        call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
-
-        ! Generate plans for forward and backward DFTs
-        fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
-        bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
-#endif
-
-#if defined(MFC_GPU)
-        @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
-        @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
-        @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
-
-#if defined(__PGI)
-        ierr = cufftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, CUFFT_D2Z, batch_size)
-        ierr = cufftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, CUFFT_Z2D, batch_size)
-#else
-        ierr = hipfftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, HIPFFT_D2Z, batch_size)
-        ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, HIPFFT_Z2D, batch_size)
-#endif
-
-#endif
+!         integer :: ierr !< Generic flag used to identify and report GPU errors
+
+!         ! Size of input array going into DFT
+!         real_size = p + 1
+!         ! Size of output array coming out of DFT
+!         cmplx_size = (p + 1)/2 + 1
+
+!         x_size = m + 1
+!         batch_size = x_size*sys_size
+
+! #if defined(MFC_GPU)
+!         rank = 1; istride = 1; ostride = 1
+
+!         allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))
+
+!         gpu_fft_size(1) = real_size; 
+!         iembed(1) = 0
+!         oembed(1) = 0
+!         $:GPU_ENTER_DATA(copyin='[real_size,cmplx_size,x_size,sys_size,batch_size,Nfq]')
+!         $:GPU_UPDATE(device='[real_size,cmplx_size,x_size,sys_size,batch_size]')
+! #else
+!         ! Allocate input and output DFT data sizes
+!         fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
+!         fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+!         fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+!         ! Associate input and output data pointers with allocated memory
+!         call c_f_pointer(fftw_real_data, data_real, [real_size])
+!         call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
+!         call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
+
+!         ! Generate plans for forward and backward DFTs
+!         fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
+!         bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
+! #endif
+
+! #if defined(MFC_GPU)
+!         @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
+!         @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+!         @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+
+! #if defined(__PGI)
+!         ierr = cufftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, CUFFT_D2Z, batch_size)
+!         ierr = cufftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, CUFFT_Z2D, batch_size)
+! #else
+!         ierr = hipfftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, HIPFFT_D2Z, batch_size)
+!         ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, HIPFFT_Z2D, batch_size)
+! #endif
+
+! #endif
 
     end subroutine s_initialize_fftw_module
 
@@ -130,169 +130,169 @@ contains
     impure subroutine s_apply_fourier_filter(q_cons_vf)
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-        integer :: i, j, k, l !< Generic loop iterators
-        integer :: ierr !< Generic flag used to identify and report GPU errors
-
-        ! Restrict filter to processors that have cells adjacent to axis
-        if (bc_y%beg >= 0) return
-#if defined(MFC_GPU)
-
-        $:GPU_PARALLEL_LOOP(collapse=3)
-        do k = 1, sys_size
-            do j = 0, m
-                do l = 1, cmplx_size
-                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        $:GPU_PARALLEL_LOOP(collapse=3)
-        do k = 1, sys_size
-            do j = 0, m
-                do l = 0, p
-                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
-#if defined(__PGI)
-            ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
-#else
-            ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
-            call hipCheck(hipDeviceSynchronize())
-#endif
-        #:endcall GPU_HOST_DATA
-        Nfq = 3
-        $:GPU_UPDATE(device='[Nfq]')
-
-        $:GPU_PARALLEL_LOOP(collapse=3)
-        do k = 1, sys_size
-            do j = 0, m
-                do l = 1, Nfq
-                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
-#if defined(__PGI)
-            ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
-#else
-            ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
-            call hipCheck(hipDeviceSynchronize())
-#endif
-        #:endcall GPU_HOST_DATA
-
-        $:GPU_PARALLEL_LOOP(collapse=3)
-        do k = 1, sys_size
-            do j = 0, m
-                do l = 0, p
-                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
-                    q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
-                end do
-            end do
-        end do
-        $:END_GPU_PARALLEL_LOOP()
-
-        do i = 1, fourier_rings
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do k = 1, sys_size
-                do j = 0, m
-                    do l = 1, cmplx_size
-                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
-            do k = 1, sys_size
-                do j = 0, m
-                    do l = 0, p
-                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
-#if defined(__PGI)
-                ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
-#else
-                ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
-                call hipCheck(hipDeviceSynchronize())
-#endif
-            #:endcall GPU_HOST_DATA
-
-            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
-            $:GPU_UPDATE(device='[Nfq]')
-
-            $:GPU_PARALLEL_LOOP(collapse=3)
-            do k = 1, sys_size
-                do j = 0, m
-                    do l = 1, Nfq
-                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-
-            #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
-#if defined(__PGI)
-                ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
-#else
-                ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
-                call hipCheck(hipDeviceSynchronize())
-#endif
-            #:endcall GPU_HOST_DATA
-
-            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
-            do k = 1, sys_size
-                do j = 0, m
-                    do l = 0, p
-                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
-                        q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
-                    end do
-                end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
-        end do
-
-#else
-        Nfq = 3
-        do j = 0, m
-            do k = 1, sys_size
-                data_fltr_cmplx(:) = (0_dp, 0_dp)
-                data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p)
-                call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
-                data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
-                call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
-                data_real(:) = data_real(:)/real(real_size, dp)
-                q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1)
-            end do
-        end do
-
-        ! Apply Fourier filter to additional rings
-        do i = 1, fourier_rings
-            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
-            do j = 0, m
-                do k = 1, sys_size
-                    data_fltr_cmplx(:) = (0_dp, 0_dp)
-                    data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p)
-                    call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
-                    data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
-                    call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
-                    data_real(:) = data_real(:)/real(real_size, dp)
-                    q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1)
-                end do
-            end do
-        end do
-#endif
+!         integer :: i, j, k, l !< Generic loop iterators
+!         integer :: ierr !< Generic flag used to identify and report GPU errors
+
+!         ! Restrict filter to processors that have cells adjacent to axis
+!         if (bc_y%beg >= 0) return
+! #if defined(MFC_GPU)
+
+!         $:GPU_PARALLEL_LOOP(collapse=3)
+!         do k = 1, sys_size
+!             do j = 0, m
+!                 do l = 1, cmplx_size
+!                     data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+!                 end do
+!             end do
+!         end do
+!         $:END_GPU_PARALLEL_LOOP()
+
+!         $:GPU_PARALLEL_LOOP(collapse=3)
+!         do k = 1, sys_size
+!             do j = 0, m
+!                 do l = 0, p
+!                     data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
+!                 end do
+!             end do
+!         end do
+!         $:END_GPU_PARALLEL_LOOP()
+
+!         #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+! #if defined(__PGI)
+!             ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+! #else
+!             ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+!             call hipCheck(hipDeviceSynchronize())
+! #endif
+!         #:endcall GPU_HOST_DATA
+!         Nfq = 3
+!         $:GPU_UPDATE(device='[Nfq]')
+
+!         $:GPU_PARALLEL_LOOP(collapse=3)
+!         do k = 1, sys_size
+!             do j = 0, m
+!                 do l = 1, Nfq
+!                     data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
+!                 end do
+!             end do
+!         end do
+!         $:END_GPU_PARALLEL_LOOP()
+
+!         #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+! #if defined(__PGI)
+!             ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+! #else
+!             ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+!             call hipCheck(hipDeviceSynchronize())
+! #endif
+!         #:endcall GPU_HOST_DATA
+
+!         $:GPU_PARALLEL_LOOP(collapse=3)
+!         do k = 1, sys_size
+!             do j = 0, m
+!                 do l = 0, p
+!                     data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
+!                     q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+!                 end do
+!             end do
+!         end do
+!         $:END_GPU_PARALLEL_LOOP()
+
+!         do i = 1, fourier_rings
+
+!             $:GPU_PARALLEL_LOOP(collapse=3)
+!             do k = 1, sys_size
+!                 do j = 0, m
+!                     do l = 1, cmplx_size
+!                         data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+!                     end do
+!                 end do
+!             end do
+!             $:END_GPU_PARALLEL_LOOP()
+
+!             $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
+!             do k = 1, sys_size
+!                 do j = 0, m
+!                     do l = 0, p
+!                         data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
+!                     end do
+!                 end do
+!             end do
+!             $:END_GPU_PARALLEL_LOOP()
+
+!             #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+! #if defined(__PGI)
+!                 ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+! #else
+!                 ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+!                 call hipCheck(hipDeviceSynchronize())
+! #endif
+!             #:endcall GPU_HOST_DATA
+
+!             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+!             $:GPU_UPDATE(device='[Nfq]')
+
+!             $:GPU_PARALLEL_LOOP(collapse=3)
+!             do k = 1, sys_size
+!                 do j = 0, m
+!                     do l = 1, Nfq
+!                         data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
+!                     end do
+!                 end do
+!             end do
+!             $:END_GPU_PARALLEL_LOOP()
+
+!             #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+! #if defined(__PGI)
+!                 ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+! #else
+!                 ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+!                 call hipCheck(hipDeviceSynchronize())
+! #endif
+!             #:endcall GPU_HOST_DATA
+
+!             $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
+!             do k = 1, sys_size
+!                 do j = 0, m
+!                     do l = 0, p
+!                         data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
+!                         q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+!                     end do
+!                 end do
+!             end do
+!             $:END_GPU_PARALLEL_LOOP()
+!         end do
+
+! #else
+!         Nfq = 3
+!         do j = 0, m
+!             do k = 1, sys_size
+!                 data_fltr_cmplx(:) = (0_dp, 0_dp)
+!                 data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p)
+!                 call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
+!                 data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
+!                 call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
+!                 data_real(:) = data_real(:)/real(real_size, dp)
+!                 q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1)
+!             end do
+!         end do
+
+!         ! Apply Fourier filter to additional rings
+!         do i = 1, fourier_rings
+!             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+!             do j = 0, m
+!                 do k = 1, sys_size
+!                     data_fltr_cmplx(:) = (0_dp, 0_dp)
+!                     data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p)
+!                     call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
+!                     data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
+!                     call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
+!                     data_real(:) = data_real(:)/real(real_size, dp)
+!                     q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1)
+!                 end do
+!             end do
+!         end do
+! #endif
 
     end subroutine s_apply_fourier_filter
 
@@ -301,25 +301,25 @@ contains
         !!      applying the Fourier filter in the azimuthal direction.
     impure subroutine s_finalize_fftw_module
 
-#if defined(MFC_GPU)
-        integer :: ierr !< Generic flag used to identify and report GPU errors
-        @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
-#if defined(__PGI)
-
-        ierr = cufftDestroy(fwd_plan_gpu)
-        ierr = cufftDestroy(bwd_plan_gpu)
-#else
-        ierr = hipfftDestroy(fwd_plan_gpu)
-        ierr = hipfftDestroy(bwd_plan_gpu)
-#endif
-#else
-        call fftw_free(fftw_real_data)
-        call fftw_free(fftw_cmplx_data)
-        call fftw_free(fftw_fltr_cmplx_data)
-
-        call fftw_destroy_plan(fwd_plan)
-        call fftw_destroy_plan(bwd_plan)
-#endif
+! #if defined(MFC_GPU)
+!         integer :: ierr !< Generic flag used to identify and report GPU errors
+!         @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
+! #if defined(__PGI)
+
+!         ierr = cufftDestroy(fwd_plan_gpu)
+!         ierr = cufftDestroy(bwd_plan_gpu)
+! #else
+!         ierr = hipfftDestroy(fwd_plan_gpu)
+!         ierr = hipfftDestroy(bwd_plan_gpu)
+! #endif
+! #else
+!         call fftw_free(fftw_real_data)
+!         call fftw_free(fftw_cmplx_data)
+!         call fftw_free(fftw_fltr_cmplx_data)
+
+!         call fftw_destroy_plan(fwd_plan)
+!         call fftw_destroy_plan(bwd_plan)
+! #endif
 
     end subroutine s_finalize_fftw_module
 end module m_fftw
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 53596338a2..4eae61eec9 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -731,9 +731,9 @@ contains
         real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
 #:endif
 #:if USING_AMD
-        real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeff
+        real(wp), dimension(32, 0:2, 0:2) :: coeff
 #:else
-        real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
+        real(wp), dimension(nterms, 0:2, 0:2) :: coeff
 #:endif
         real(wp) :: pres, rho, nbub, c, alf, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, grad_T
         real(wp) :: n_tait, B_tait

From fc6e65bdf0a361c3742def15253351805fd5c02f Mon Sep 17 00:00:00 2001
From: Anand Radhakrishnan <aradhakr34@frontier10398.frontier.olcf.ornl.gov>
Date: Tue, 27 Jan 2026 01:44:29 -0500
Subject: [PATCH 06/21] Phase Change fixed (539/546 cases work)

---
 src/common/m_phase_change.fpp | 268 ++++++++------------
 src/simulation/m_fftw.fpp     | 462 +++++++++++++++++-----------------
 2 files changed, 337 insertions(+), 393 deletions(-)

diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index a7742c0532..b1505a44eb 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -434,6 +434,7 @@ contains
         real(wp), dimension(2) :: R2D, DeltamP !< residual and correction array
         real(wp) :: Om ! underrelaxation factor
         real(wp) :: mCP, mCPD, mCVGP, mCVGP2, mQ, mQD ! auxiliary variables for the pTg-solver
+        real(wp) :: ml, mT, dFdT, dTdm, dTdp
 
         !< Generic loop iterators
         integer :: i, ns
@@ -506,11 +507,85 @@ contains
             end do
 
             ! calculating the (2D) Jacobian Matrix used in the solution of the pTg-quilibrium model
-            call s_compute_jacobian_matrix(InvJac, j, Jac, k, l, mCPD, mCVGP, mCVGP2, pS, q_cons_vf, TJac)
+
+            ! mass of the reacting liquid
+            ml = q_cons_vf(lp + contxb - 1)%sf(j, k, l)
+
+            ! mass of the two participating fluids
+            mT = q_cons_vf(lp + contxb - 1)%sf(j, k, l) &
+                 + q_cons_vf(vp + contxb - 1)%sf(j, k, l)
+
+            TS = 1/(mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) &
+                    + ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                          - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
+                    + mCVGP)
+
+            dFdT = &
+                -(cvs(lp)*gs_min(lp) - cvs(vp)*gs_min(vp))*log(TS) &
+                - (qvps(lp) - qvps(vp)) &
+                + cvs(lp)*(gs_min(lp) - 1)*log(pS + ps_inf(lp)) &
+                - cvs(vp)*(gs_min(vp) - 1)*log(pS + ps_inf(vp))
+
+            dTdm = -(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                     - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)))*TS**2
+
+            dTdp = (mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))**2 &
+                    + ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp))**2 &
+                          - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))**2) &
+                    + mCVGP2)*TS**2
+
+            ! F = (F1,F2) is the function whose roots we are looking for
+            ! x = (m1, p) are the independent variables. m1 = mass of the first participant fluid, p = pressure
+            ! F1 = 0 is the Gibbs free energy quality
+            ! F2 = 0 is the enforcement of the thermodynamic (total - kinectic) energy
+            ! dF1dm
+            Jac(1, 1) = dFdT*dTdm
+
+            ! dF1dp
+            Jac(1, 2) = dFdT*dTdp + TS &
+                        *(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                          - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)))
+
+            ! dF2dm
+            Jac(2, 1) = (qvs(vp) - qvs(lp) &
+                         + (cvs(vp)*gs_min(vp) - cvs(lp)*gs_min(lp)) &
+                         /(ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                               - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
+                           + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) + mCVGP) &
+                         - (ml*(cvs(vp)*gs_min(vp) - cvs(lp)*gs_min(lp)) &
+                            - mT*cvs(vp)*gs_min(vp) - mCPD) &
+                         *(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                           - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
+                         /((ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                                - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
+                            + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) + mCVGP)**2))/1
+            ! dF2dp
+            Jac(2, 2) = (1 + (ml*(cvs(vp)*gs_min(vp) - cvs(lp)*gs_min(lp)) &
+                              - mT*cvs(vp)*gs_min(vp) - mCPD) &
+                         *(ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp))**2 &
+                               - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))**2) &
+                           + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))**2 + mCVGP2) &
+                         /(ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                               - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
+                           + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) + mCVGP)**2)/1
+
+            ! intermediate elements of J^{-1}
+            InvJac(1, 1) = Jac(2, 2)
+            InvJac(1, 2) = -1.0_wp*Jac(1, 2)
+            InvJac(2, 1) = -1.0_wp*Jac(2, 1)
+            InvJac(2, 2) = Jac(1, 1)
+
+            ! elements of J^{T}
+            TJac(1, 1) = Jac(1, 1)
+            TJac(1, 2) = Jac(2, 1)
+            TJac(2, 1) = Jac(1, 2)
+            TJac(2, 2) = Jac(2, 2)
+
+            ! dividing by det(J)
+            InvJac = InvJac / (Jac(1, 1)*Jac(2, 2) - Jac(1, 2)*Jac(2, 1))
 
             ! calculating correction array for Newton's method
-            DeltamP(1) = -1.0_wp*(InvJac(1,1)*R2D(1) + InvJac(1,2)*R2D(2))
-            DeltamP(2) = -1.0_wp*(InvJac(2,1)*R2D(1) + InvJac(2,2)*R2D(2))
+            DeltamP = -1.0_wp*(matmul(InvJac, R2D))
 
             ! updating two reacting 'masses'. Recall that inert 'masses' do not change during the phase change
             ! liquid
@@ -524,7 +599,34 @@ contains
 
             ! calculating residuals, which are (i) the difference between the Gibbs Free energy of the gas and the liquid
             ! and (ii) the energy before and after the phase-change process.
-            call s_compute_pTg_residue(j, k, l, mCPD, mCVGP, mQD, q_cons_vf, pS, rhoe, R2D)
+
+            ! mass of the reacting liquid
+            ml = q_cons_vf(lp + contxb - 1)%sf(j, k, l)
+
+            ! mass of the two participating fluids
+            mT = q_cons_vf(lp + contxb - 1)%sf(j, k, l) &
+                 + q_cons_vf(vp + contxb - 1)%sf(j, k, l)
+
+            TS = 1/(mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) &
+                    + ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                          - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
+                    + mCVGP)
+
+            ! Gibbs Free Energy Equality condition (DG)
+            R2D(1) = TS*((cvs(lp)*gs_min(lp) - cvs(vp)*gs_min(vp)) &
+                         *(1 - log(TS)) - (qvps(lp) - qvps(vp)) &
+                         + cvs(lp)*(gs_min(lp) - 1)*log(pS + ps_inf(lp)) &
+                         - cvs(vp)*(gs_min(vp) - 1)*log(pS + ps_inf(vp))) &
+                     + qvs(lp) - qvs(vp)
+
+            ! Constant Energy Process condition (DE)
+            R2D(2) = (rhoe + pS &
+                      + ml*(qvs(vp) - qvs(lp)) - mT*qvs(vp) - mQD &
+                      + (ml*(gs_min(vp)*cvs(vp) - gs_min(lp)*cvs(lp)) &
+                         - mT*gs_min(vp)*cvs(vp) - mCPD) &
+                      /(ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
+                            - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
+                        + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) + mCVGP))/1
 
         end do
 
@@ -586,164 +688,6 @@ contains
         end if
     end subroutine s_correct_partial_densities
 
-    !>  This auxiliary subroutine calculates the 2 x 2 Jacobian and, its inverse and transpose
-        !!      to be used in the pTg-equilibirium procedure
-        !!  @param InvJac Inverse of the Jacobian Matrix
-        !!  @param j generic loop iterator for x direction
-        !!  @param Jac Jacobian Matrix
-        !!  @param k generic loop iterator for y direction
-        !!  @param l generic loop iterator for z direction
-        !!  @param mCPD  sum of the total alpha*rho*cp
-        !!  @param mCVGP auxiliary variable for the calculation of the matrices: alpha*rho*cv*(g-1)/press
-        !!  @param mCVGP2 auxiliary variable for the calculation of the matrices: alpha*rho*cv*(g-1)/press^2
-        !!  @param pS equilibrium pressure at the interface
-        !!  @param q_cons_vf Cell-average conservative variables
-        !!  @param TJac Transpose of the Jacobian Matrix
-    subroutine s_compute_jacobian_matrix(InvJac, j, Jac, k, l, mCPD, mCVGP, mCVGP2, pS, q_cons_vf, TJac)
-        $:GPU_ROUTINE(function_name='s_compute_jacobian_matrix', &
-            & parallelism='[seq]', cray_inline=True)
-
-        real(wp), dimension(2, 2), intent(out) :: InvJac
-        integer, intent(in) :: j
-        real(wp), dimension(2, 2), intent(out) :: Jac
-        integer, intent(in) :: k, l
-        real(wp), intent(in) :: mCPD, mCVGP, mCVGP2, pS
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        real(wp), dimension(2, 2), intent(out) :: TJac
-
-        real(wp) :: ml, mT, TS, dFdT, dTdm, dTdp ! mass of the reacting fluid, total reacting mass, and auxiliary variables
-
-        ! mass of the reacting liquid
-        ml = q_cons_vf(lp + contxb - 1)%sf(j, k, l)
-
-        ! mass of the two participating fluids
-        mT = q_cons_vf(lp + contxb - 1)%sf(j, k, l) &
-             + q_cons_vf(vp + contxb - 1)%sf(j, k, l)
-
-        TS = 1/(mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) &
-                + ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                      - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
-                + mCVGP)
-
-        dFdT = &
-            -(cvs(lp)*gs_min(lp) - cvs(vp)*gs_min(vp))*log(TS) &
-            - (qvps(lp) - qvps(vp)) &
-            + cvs(lp)*(gs_min(lp) - 1)*log(pS + ps_inf(lp)) &
-            - cvs(vp)*(gs_min(vp) - 1)*log(pS + ps_inf(vp))
-
-        dTdm = -(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                 - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)))*TS**2
-
-        dTdp = (mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))**2 &
-                + ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp))**2 &
-                      - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))**2) &
-                + mCVGP2)*TS**2
-
-        ! F = (F1,F2) is the function whose roots we are looking for
-        ! x = (m1, p) are the independent variables. m1 = mass of the first participant fluid, p = pressure
-        ! F1 = 0 is the Gibbs free energy quality
-        ! F2 = 0 is the enforcement of the thermodynamic (total - kinectic) energy
-        ! dF1dm
-        Jac(1, 1) = dFdT*dTdm
-
-        ! dF1dp
-        Jac(1, 2) = dFdT*dTdp + TS &
-                    *(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                      - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)))
-
-        ! dF2dm
-        Jac(2, 1) = (qvs(vp) - qvs(lp) &
-                     + (cvs(vp)*gs_min(vp) - cvs(lp)*gs_min(lp)) &
-                     /(ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                           - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
-                       + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) + mCVGP) &
-                     - (ml*(cvs(vp)*gs_min(vp) - cvs(lp)*gs_min(lp)) &
-                        - mT*cvs(vp)*gs_min(vp) - mCPD) &
-                     *(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                       - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
-                     /((ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                            - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
-                        + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) + mCVGP)**2))/1
-        ! dF2dp
-        Jac(2, 2) = (1 + (ml*(cvs(vp)*gs_min(vp) - cvs(lp)*gs_min(lp)) &
-                          - mT*cvs(vp)*gs_min(vp) - mCPD) &
-                     *(ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp))**2 &
-                           - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))**2) &
-                       + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))**2 + mCVGP2) &
-                     /(ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                           - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
-                       + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) + mCVGP)**2)/1
-
-        ! intermediate elements of J^{-1}
-        InvJac(1, 1) = Jac(2, 2)
-        InvJac(1, 2) = -1.0_wp*Jac(1, 2)
-        InvJac(2, 1) = -1.0_wp*Jac(2, 1)
-        InvJac(2, 2) = Jac(1, 1)
-
-        ! elements of J^{T}
-        TJac(1, 1) = Jac(1, 1)
-        TJac(1, 2) = Jac(2, 1)
-        TJac(2, 1) = Jac(1, 2)
-        TJac(2, 2) = Jac(2, 2)
-
-        ! dividing by det(J)
-        InvJac = InvJac/(Jac(1, 1)*Jac(2, 2) - Jac(1, 2)*Jac(2, 1))
-
-    end subroutine s_compute_jacobian_matrix
-
-    !>  This auxiliary subroutine computes the residue of the pTg-equilibrium procedure
-        !!  @param j generic loop iterator for x direction
-        !!  @param k generic loop iterator for y direction
-        !!  @param l generic loop iterator for z direction
-        !!  @param mCPD  sum of the total alpha*rho*cp
-        !!  @param mCVGP auxiliary variable for the calculation of the matrices: alpha*rho*cv*(g-1)/press
-        !!  @param mQD sum of the total alpha*rho*qv
-        !!  @param q_cons_vf Cell-average conservative variables
-        !!  @param pS equilibrium pressure at the interface
-        !!  @param rhoe mixture energy
-        !!  @param R2D (2D) residue array
-    subroutine s_compute_pTg_residue(j, k, l, mCPD, mCVGP, mQD, q_cons_vf, pS, rhoe, R2D)
-        $:GPU_ROUTINE(function_name='s_compute_pTg_residue', &
-            & parallelism='[seq]', cray_inline=True)
-
-        integer, intent(in) :: j, k, l
-        real(wp), intent(in) :: mCPD, mCVGP, mQD
-        type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
-        real(wp), intent(in) :: pS, rhoe
-        real(wp), dimension(2), intent(out) :: R2D
-
-        real(wp) :: ml, mT, TS !< mass of the reacting liquid, total reacting mass, equilibrium temperature
-
-        ! mass of the reacting liquid
-        ml = q_cons_vf(lp + contxb - 1)%sf(j, k, l)
-
-        ! mass of the two participating fluids
-        mT = q_cons_vf(lp + contxb - 1)%sf(j, k, l) &
-             + q_cons_vf(vp + contxb - 1)%sf(j, k, l)
-
-        TS = 1/(mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) &
-                + ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                      - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
-                + mCVGP)
-
-        ! Gibbs Free Energy Equality condition (DG)
-        R2D(1) = TS*((cvs(lp)*gs_min(lp) - cvs(vp)*gs_min(vp)) &
-                     *(1 - log(TS)) - (qvps(lp) - qvps(vp)) &
-                     + cvs(lp)*(gs_min(lp) - 1)*log(pS + ps_inf(lp)) &
-                     - cvs(vp)*(gs_min(vp) - 1)*log(pS + ps_inf(vp))) &
-                 + qvs(lp) - qvs(vp)
-
-        ! Constant Energy Process condition (DE)
-        R2D(2) = (rhoe + pS &
-                  + ml*(qvs(vp) - qvs(lp)) - mT*qvs(vp) - mQD &
-                  + (ml*(gs_min(vp)*cvs(vp) - gs_min(lp)*cvs(lp)) &
-                     - mT*gs_min(vp)*cvs(vp) - mCPD) &
-                  /(ml*(cvs(lp)*(gs_min(lp) - 1)/(pS + ps_inf(lp)) &
-                        - cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp))) &
-                    + mT*cvs(vp)*(gs_min(vp) - 1)/(pS + ps_inf(vp)) + mCVGP))/1
-
-    end subroutine s_compute_pTg_residue
-
     !>  This auxiliary subroutine finds the Saturation temperature for a given
         !!      saturation pressure through a newton solver
         !!  @param pSat Saturation Pressure
diff --git a/src/simulation/m_fftw.fpp b/src/simulation/m_fftw.fpp
index 55f8c8d123..411485af10 100644
--- a/src/simulation/m_fftw.fpp
+++ b/src/simulation/m_fftw.fpp
@@ -70,55 +70,55 @@ contains
         !!      applying the Fourier filter in the azimuthal direction.
     impure subroutine s_initialize_fftw_module
 
-!         integer :: ierr !< Generic flag used to identify and report GPU errors
-
-!         ! Size of input array going into DFT
-!         real_size = p + 1
-!         ! Size of output array coming out of DFT
-!         cmplx_size = (p + 1)/2 + 1
-
-!         x_size = m + 1
-!         batch_size = x_size*sys_size
-
-! #if defined(MFC_GPU)
-!         rank = 1; istride = 1; ostride = 1
-
-!         allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))
-
-!         gpu_fft_size(1) = real_size; 
-!         iembed(1) = 0
-!         oembed(1) = 0
-!         $:GPU_ENTER_DATA(copyin='[real_size,cmplx_size,x_size,sys_size,batch_size,Nfq]')
-!         $:GPU_UPDATE(device='[real_size,cmplx_size,x_size,sys_size,batch_size]')
-! #else
-!         ! Allocate input and output DFT data sizes
-!         fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
-!         fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-!         fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
-!         ! Associate input and output data pointers with allocated memory
-!         call c_f_pointer(fftw_real_data, data_real, [real_size])
-!         call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
-!         call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
-
-!         ! Generate plans for forward and backward DFTs
-!         fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
-!         bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
-! #endif
-
-! #if defined(MFC_GPU)
-!         @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
-!         @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
-!         @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
-
-! #if defined(__PGI)
-!         ierr = cufftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, CUFFT_D2Z, batch_size)
-!         ierr = cufftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, CUFFT_Z2D, batch_size)
-! #else
-!         ierr = hipfftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, HIPFFT_D2Z, batch_size)
-!         ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, HIPFFT_Z2D, batch_size)
-! #endif
-
-! #endif
+        integer :: ierr !< Generic flag used to identify and report GPU errors
+
+        ! Size of input array going into DFT
+        real_size = p + 1
+        ! Size of output array coming out of DFT
+        cmplx_size = (p + 1)/2 + 1
+
+        x_size = m + 1
+        batch_size = x_size*sys_size
+
+#if defined(MFC_GPU)
+        rank = 1; istride = 1; ostride = 1
+
+        allocate (gpu_fft_size(1:rank), iembed(1:rank), oembed(1:rank))
+
+        gpu_fft_size(1) = real_size; 
+        iembed(1) = 0
+        oembed(1) = 0
+        $:GPU_ENTER_DATA(copyin='[real_size,cmplx_size,x_size,sys_size,batch_size,Nfq]')
+        $:GPU_UPDATE(device='[real_size,cmplx_size,x_size,sys_size,batch_size]')
+#else
+        ! Allocate input and output DFT data sizes
+        fftw_real_data = fftw_alloc_real(int(real_size, c_size_t))
+        fftw_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+        fftw_fltr_cmplx_data = fftw_alloc_complex(int(cmplx_size, c_size_t))
+        ! Associate input and output data pointers with allocated memory
+        call c_f_pointer(fftw_real_data, data_real, [real_size])
+        call c_f_pointer(fftw_cmplx_data, data_cmplx, [cmplx_size])
+        call c_f_pointer(fftw_fltr_cmplx_data, data_fltr_cmplx, [cmplx_size])
+
+        ! Generate plans for forward and backward DFTs
+        fwd_plan = fftw_plan_dft_r2c_1d(real_size, data_real, data_cmplx, FFTW_ESTIMATE)
+        bwd_plan = fftw_plan_dft_c2r_1d(real_size, data_fltr_cmplx, data_real, FFTW_ESTIMATE)
+#endif
+
+#if defined(MFC_GPU)
+        @:ALLOCATE(data_real_gpu(1:real_size*x_size*sys_size))
+        @:ALLOCATE(data_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+        @:ALLOCATE(data_fltr_cmplx_gpu(1:cmplx_size*x_size*sys_size))
+
+#if defined(__PGI)
+        ierr = cufftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, CUFFT_D2Z, batch_size)
+        ierr = cufftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, CUFFT_Z2D, batch_size)
+#else
+        ierr = hipfftPlanMany(fwd_plan_gpu, rank, gpu_fft_size, iembed, istride, real_size, oembed, ostride, cmplx_size, HIPFFT_D2Z, batch_size)
+        ierr = hipfftPlanMany(bwd_plan_gpu, rank, gpu_fft_size, iembed, istride, cmplx_size, oembed, ostride, real_size, HIPFFT_Z2D, batch_size)
+#endif
+
+#endif
 
     end subroutine s_initialize_fftw_module
 
@@ -130,169 +130,169 @@ contains
     impure subroutine s_apply_fourier_filter(q_cons_vf)
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
-!         integer :: i, j, k, l !< Generic loop iterators
-!         integer :: ierr !< Generic flag used to identify and report GPU errors
-
-!         ! Restrict filter to processors that have cells adjacent to axis
-!         if (bc_y%beg >= 0) return
-! #if defined(MFC_GPU)
-
-!         $:GPU_PARALLEL_LOOP(collapse=3)
-!         do k = 1, sys_size
-!             do j = 0, m
-!                 do l = 1, cmplx_size
-!                     data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
-!                 end do
-!             end do
-!         end do
-!         $:END_GPU_PARALLEL_LOOP()
-
-!         $:GPU_PARALLEL_LOOP(collapse=3)
-!         do k = 1, sys_size
-!             do j = 0, m
-!                 do l = 0, p
-!                     data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
-!                 end do
-!             end do
-!         end do
-!         $:END_GPU_PARALLEL_LOOP()
-
-!         #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
-! #if defined(__PGI)
-!             ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
-! #else
-!             ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
-!             call hipCheck(hipDeviceSynchronize())
-! #endif
-!         #:endcall GPU_HOST_DATA
-!         Nfq = 3
-!         $:GPU_UPDATE(device='[Nfq]')
-
-!         $:GPU_PARALLEL_LOOP(collapse=3)
-!         do k = 1, sys_size
-!             do j = 0, m
-!                 do l = 1, Nfq
-!                     data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
-!                 end do
-!             end do
-!         end do
-!         $:END_GPU_PARALLEL_LOOP()
-
-!         #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
-! #if defined(__PGI)
-!             ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
-! #else
-!             ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
-!             call hipCheck(hipDeviceSynchronize())
-! #endif
-!         #:endcall GPU_HOST_DATA
-
-!         $:GPU_PARALLEL_LOOP(collapse=3)
-!         do k = 1, sys_size
-!             do j = 0, m
-!                 do l = 0, p
-!                     data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
-!                     q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
-!                 end do
-!             end do
-!         end do
-!         $:END_GPU_PARALLEL_LOOP()
-
-!         do i = 1, fourier_rings
-
-!             $:GPU_PARALLEL_LOOP(collapse=3)
-!             do k = 1, sys_size
-!                 do j = 0, m
-!                     do l = 1, cmplx_size
-!                         data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
-!                     end do
-!                 end do
-!             end do
-!             $:END_GPU_PARALLEL_LOOP()
-
-!             $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
-!             do k = 1, sys_size
-!                 do j = 0, m
-!                     do l = 0, p
-!                         data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
-!                     end do
-!                 end do
-!             end do
-!             $:END_GPU_PARALLEL_LOOP()
-
-!             #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
-! #if defined(__PGI)
-!                 ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
-! #else
-!                 ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
-!                 call hipCheck(hipDeviceSynchronize())
-! #endif
-!             #:endcall GPU_HOST_DATA
-
-!             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
-!             $:GPU_UPDATE(device='[Nfq]')
-
-!             $:GPU_PARALLEL_LOOP(collapse=3)
-!             do k = 1, sys_size
-!                 do j = 0, m
-!                     do l = 1, Nfq
-!                         data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
-!                     end do
-!                 end do
-!             end do
-!             $:END_GPU_PARALLEL_LOOP()
-
-!             #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
-! #if defined(__PGI)
-!                 ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
-! #else
-!                 ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
-!                 call hipCheck(hipDeviceSynchronize())
-! #endif
-!             #:endcall GPU_HOST_DATA
-
-!             $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
-!             do k = 1, sys_size
-!                 do j = 0, m
-!                     do l = 0, p
-!                         data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
-!                         q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
-!                     end do
-!                 end do
-!             end do
-!             $:END_GPU_PARALLEL_LOOP()
-!         end do
-
-! #else
-!         Nfq = 3
-!         do j = 0, m
-!             do k = 1, sys_size
-!                 data_fltr_cmplx(:) = (0_dp, 0_dp)
-!                 data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p)
-!                 call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
-!                 data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
-!                 call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
-!                 data_real(:) = data_real(:)/real(real_size, dp)
-!                 q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1)
-!             end do
-!         end do
-
-!         ! Apply Fourier filter to additional rings
-!         do i = 1, fourier_rings
-!             Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
-!             do j = 0, m
-!                 do k = 1, sys_size
-!                     data_fltr_cmplx(:) = (0_dp, 0_dp)
-!                     data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p)
-!                     call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
-!                     data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
-!                     call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
-!                     data_real(:) = data_real(:)/real(real_size, dp)
-!                     q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1)
-!                 end do
-!             end do
-!         end do
-! #endif
+        integer :: i, j, k, l !< Generic loop iterators
+        integer :: ierr !< Generic flag used to identify and report GPU errors
+
+        ! Restrict filter to processors that have cells adjacent to axis
+        if (bc_y%beg >= 0) return
+#if defined(MFC_GPU)
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 1, cmplx_size
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 0, p
+                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, 0, l)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+#if defined(__PGI)
+            ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+#else
+            ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+            call hipCheck(hipDeviceSynchronize())
+#endif
+        #:endcall GPU_HOST_DATA
+        Nfq = 3
+        $:GPU_UPDATE(device='[Nfq]')
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 1, Nfq
+                    data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+#if defined(__PGI)
+            ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+#else
+            ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+            call hipCheck(hipDeviceSynchronize())
+#endif
+        #:endcall GPU_HOST_DATA
+
+        $:GPU_PARALLEL_LOOP(collapse=3)
+        do k = 1, sys_size
+            do j = 0, m
+                do l = 0, p
+                    data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
+                    q_cons_vf(k)%sf(j, 0, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                end do
+            end do
+        end do
+        $:END_GPU_PARALLEL_LOOP()
+
+        do i = 1, fourier_rings
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, cmplx_size
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = (0_dp, 0_dp)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = q_cons_vf(k)%sf(j, i, l)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+#if defined(__PGI)
+                ierr = cufftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+#else
+                ierr = hipfftExecD2Z(fwd_plan_gpu, data_real_gpu, data_cmplx_gpu)
+                call hipCheck(hipDeviceSynchronize())
+#endif
+            #:endcall GPU_HOST_DATA
+
+            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+            $:GPU_UPDATE(device='[Nfq]')
+
+            $:GPU_PARALLEL_LOOP(collapse=3)
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 1, Nfq
+                        data_fltr_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size) = data_cmplx_gpu(l + j*cmplx_size + (k - 1)*cmplx_size*x_size)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+
+            #:call GPU_HOST_DATA(use_device_addr='[data_real_gpu, data_cmplx_gpu, data_fltr_cmplx_gpu]')
+#if defined(__PGI)
+                ierr = cufftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+#else
+                ierr = hipfftExecZ2D(bwd_plan_gpu, data_fltr_cmplx_gpu, data_real_gpu)
+                call hipCheck(hipDeviceSynchronize())
+#endif
+            #:endcall GPU_HOST_DATA
+
+            $:GPU_PARALLEL_LOOP(collapse=3, firstprivate='[i]')
+            do k = 1, sys_size
+                do j = 0, m
+                    do l = 0, p
+                        data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)/real(real_size, dp)
+                        q_cons_vf(k)%sf(j, i, l) = data_real_gpu(l + j*real_size + 1 + (k - 1)*real_size*x_size)
+                    end do
+                end do
+            end do
+            $:END_GPU_PARALLEL_LOOP()
+        end do
+
+#else
+        Nfq = 3
+        do j = 0, m
+            do k = 1, sys_size
+                data_fltr_cmplx(:) = (0_dp, 0_dp)
+                data_real(1:p + 1) = q_cons_vf(k)%sf(j, 0, 0:p)
+                call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
+                data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
+                call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
+                data_real(:) = data_real(:)/real(real_size, dp)
+                q_cons_vf(k)%sf(j, 0, 0:p) = data_real(1:p + 1)
+            end do
+        end do
+
+        ! Apply Fourier filter to additional rings
+        do i = 1, fourier_rings
+            Nfq = min(floor(2_dp*real(i, dp)*pi), cmplx_size)
+            do j = 0, m
+                do k = 1, sys_size
+                    data_fltr_cmplx(:) = (0_dp, 0_dp)
+                    data_real(1:p + 1) = q_cons_vf(k)%sf(j, i, 0:p)
+                    call fftw_execute_dft_r2c(fwd_plan, data_real, data_cmplx)
+                    data_fltr_cmplx(1:Nfq) = data_cmplx(1:Nfq)
+                    call fftw_execute_dft_c2r(bwd_plan, data_fltr_cmplx, data_real)
+                    data_real(:) = data_real(:)/real(real_size, dp)
+                    q_cons_vf(k)%sf(j, i, 0:p) = data_real(1:p + 1)
+                end do
+            end do
+        end do
+#endif
 
     end subroutine s_apply_fourier_filter
 
@@ -301,25 +301,25 @@ contains
         !!      applying the Fourier filter in the azimuthal direction.
     impure subroutine s_finalize_fftw_module
 
-! #if defined(MFC_GPU)
-!         integer :: ierr !< Generic flag used to identify and report GPU errors
-!         @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
-! #if defined(__PGI)
-
-!         ierr = cufftDestroy(fwd_plan_gpu)
-!         ierr = cufftDestroy(bwd_plan_gpu)
-! #else
-!         ierr = hipfftDestroy(fwd_plan_gpu)
-!         ierr = hipfftDestroy(bwd_plan_gpu)
-! #endif
-! #else
-!         call fftw_free(fftw_real_data)
-!         call fftw_free(fftw_cmplx_data)
-!         call fftw_free(fftw_fltr_cmplx_data)
-
-!         call fftw_destroy_plan(fwd_plan)
-!         call fftw_destroy_plan(bwd_plan)
-! #endif
+#if defined(MFC_GPU)
+        integer :: ierr !< Generic flag used to identify and report GPU errors
+        @:DEALLOCATE(data_real_gpu, data_fltr_cmplx_gpu, data_cmplx_gpu)
+#if defined(__PGI)
+
+        ierr = cufftDestroy(fwd_plan_gpu)
+        ierr = cufftDestroy(bwd_plan_gpu)
+#else
+        ierr = hipfftDestroy(fwd_plan_gpu)
+        ierr = hipfftDestroy(bwd_plan_gpu)
+#endif
+#else
+        call fftw_free(fftw_real_data)
+        call fftw_free(fftw_cmplx_data)
+        call fftw_free(fftw_fltr_cmplx_data)
+
+        call fftw_destroy_plan(fwd_plan)
+        call fftw_destroy_plan(bwd_plan)
+#endif
 
     end subroutine s_finalize_fftw_module
 end module m_fftw

From 224a4ca32b76f28f986edc7e748604b2936a0bc1 Mon Sep 17 00:00:00 2001
From: Anand Radhakrishnan <aradhakr34@login05.frontier.olcf.ornl.gov>
Date: Tue, 27 Jan 2026 16:11:03 -0500
Subject: [PATCH 07/21] MPI and FFT fixed

---
 CMakeLists.txt | 11 ++++++-----
 load_amd.sh    |  1 +
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 77f398be7e..082bff3e65 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -460,7 +460,8 @@ function(MFC_SETUP_TARGET)
         if (MFC_MPI AND ARGS_MPI)
             find_package(MPI COMPONENTS Fortran REQUIRED)
 	    
-	    target_compile_definitions(${a_target} PRIVATE $ENV{CRAY_MPICH_INC})
+	    target_compile_definitions(${a_target} PRIVATE MFC_MPI)
+	    target_compile_options(${a_target} PRIVATE "$ENV{CRAY_MPICH_INC}")
             target_link_libraries     (${a_target} PRIVATE $ENV{CRAY_MPICH_LIB})
         endif()
 
@@ -481,10 +482,10 @@ function(MFC_SETUP_TARGET)
                     target_link_libraries(${a_target} PRIVATE CUDA::cudart CUDA::cufft)
                 else()
 			#find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
-                    target_link_libraries(${a_target} PRIVATE $ENV{CRAY_HIPFORT_INC})
+		    target_link_libraries(${a_target} PRIVATE $ENV{CRAY_HIPFORT_LIB})
 		endif()
             else()
-                find_package(FFTW REQUIRED)
+		find_package(FFTW REQUIRED)
                 target_link_libraries(${a_target} PRIVATE FFTW::FFTW)
             endif()
         endif()
@@ -527,8 +528,8 @@ function(MFC_SETUP_TARGET)
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-		    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a $ENV{CRAY_MPICH_INC})
-		    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a  $ENV{CRAY_MPICH_LIB})
+		    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
+		    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
                 endif()
             endif()
 
diff --git a/load_amd.sh b/load_amd.sh
index 7137c18fc4..4fffa3c536 100644
--- a/load_amd.sh
+++ b/load_amd.sh
@@ -11,6 +11,7 @@ export LD_LIBRARY_PATH=${OLCF_AFAR_ROOT}/lib:${OLCF_AFAR_ROOT}/lib/llvm/lib:${LD
 
 export CRAY_MPICH_INC="-I${OLCF_AFAR_ROOT}/include/mpich3.4a2"
 export CRAY_HIPFORT_INC="-I${OLCF_AFAR_ROOT}/include/hipfort/amdgcn"
+export CRAY_HIPFORT_LIB="-L${OLCF_AFAR_ROOT}/lib -lhipfft"
 export CRAY_HIP_INC="-I${OLCF_AFAR_ROOT}/include/hip"
 export CRAY_MPICH_LIB="-L${CRAY_MPICH_PREFIX}/lib \
                         ${CRAY_PMI_POST_LINK_OPTS} \

From 3e80c6538bd6adf48b1c3bbed4387512e18c2d02 Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Wed, 28 Jan 2026 23:32:41 -0500
Subject: [PATCH 08/21] format

---
 src/common/m_boundary_common.fpp          |  106 +-
 src/common/m_chemistry.fpp                |   52 +-
 src/common/m_phase_change.fpp             |   70 +-
 src/common/m_variables_conversion.fpp     |   79 +-
 src/pre_process/m_global_parameters.fpp   |    2 +-
 src/simulation/include/inline_riemann.fpp |   73 +-
 src/simulation/m_acoustic_src.fpp         |   10 +-
 src/simulation/m_bubbles_EE.fpp           |   14 +-
 src/simulation/m_bubbles_EL.fpp           |   20 +-
 src/simulation/m_cbc.fpp                  |   51 +-
 src/simulation/m_compute_cbc.fpp          |  294 +++--
 src/simulation/m_data_output.fpp          |   84 +-
 src/simulation/m_hyperelastic.fpp         |   22 +-
 src/simulation/m_ibm.fpp                  |   36 +-
 src/simulation/m_igr.fpp                  | 1371 ++++++++++-----------
 src/simulation/m_pressure_relaxation.fpp  |   20 +-
 src/simulation/m_qbmm.fpp                 |   74 +-
 src/simulation/m_rhs.fpp                  |    8 +-
 src/simulation/m_riemann_solvers.fpp      |  508 ++++----
 src/simulation/m_sim_helpers.fpp          |   24 +-
 src/simulation/m_start_up.fpp             |    4 +-
 src/simulation/m_surface_tension.fpp      |   72 +-
 src/simulation/m_time_steppers.fpp        |   38 +-
 src/simulation/m_viscous.fpp              |   14 +-
 src/simulation/m_weno.fpp                 |  101 +-
 25 files changed, 1568 insertions(+), 1579 deletions(-)

diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index 8f5412f149..a0c4ed2455 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -54,7 +54,7 @@ contains
 
     impure subroutine s_initialize_boundary_common_module()
 
-        integer :: i, j 
+        integer :: i, j
 
         @:ALLOCATE(bc_buffers(1:3, 1:2))
 
@@ -62,20 +62,20 @@ contains
             @:ALLOCATE(bc_buffers(1, 1)%sf(1:sys_size, 0:n, 0:p))
             @:ALLOCATE(bc_buffers(1, 2)%sf(1:sys_size, 0:n, 0:p))
             #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-            if (n > 0) then
-                @:ALLOCATE(bc_buffers(2,1)%sf(-buff_size:m+buff_size,1:sys_size,0:p))
-                @:ALLOCATE(bc_buffers(2,2)%sf(-buff_size:m+buff_size,1:sys_size,0:p))                
-                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                if (p > 0) then            
-                    @:ALLOCATE(bc_buffers(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,1:sys_size))
-                    @:ALLOCATE(bc_buffers(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,1:sys_size))
-                end if  
-                #:endif
-            end if
+                if (n > 0) then
+                    @:ALLOCATE(bc_buffers(2,1)%sf(-buff_size:m+buff_size,1:sys_size,0:p))
+                    @:ALLOCATE(bc_buffers(2,2)%sf(-buff_size:m+buff_size,1:sys_size,0:p))
+                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                        if (p > 0) then
+                            @:ALLOCATE(bc_buffers(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,1:sys_size))
+                            @:ALLOCATE(bc_buffers(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,1:sys_size))
+                        end if
+                    #:endif
+                end if
             #:endif
             do i = 1, num_dims
                 do j = 1, 2
-                @:ACC_SETUP_SFs(bc_buffers(i,j))
+                    @:ACC_SETUP_SFs(bc_buffers(i,j))
                 end do
             end do
 
@@ -160,7 +160,7 @@ contains
         if (n == 0) return
 
         #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-        
+
             if (bc_y%beg >= 0) then
                 call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, -1, sys_size, pb_in, mv_in)
             else
@@ -194,8 +194,6 @@ contains
                 $:END_GPU_PARALLEL_LOOP()
             end if
 
-         
-
             if (bc_y%end >= 0) then
                 call s_mpi_sendrecv_variables_buffers(q_prim_vf, 2, 1, sys_size, pb_in, mv_in)
             else
@@ -1063,41 +1061,41 @@ contains
                 end do
             end if
         elseif (bc_dir == 2) then !< y-direction
-#:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-            if (bc_loc == -1) then !< bc_y%beg
-                do i = 1, sys_size
-                    do j = 1, buff_size
-                        q_prim_vf(i)%sf(k, -j, l) = &
-                            bc_buffers(2, 1)%sf(k, i, l)
+            #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
+                if (bc_loc == -1) then !< bc_y%beg
+                    do i = 1, sys_size
+                        do j = 1, buff_size
+                            q_prim_vf(i)%sf(k, -j, l) = &
+                                bc_buffers(2, 1)%sf(k, i, l)
+                        end do
                     end do
-                end do
-            else !< bc_y%end
-                do i = 1, sys_size
-                    do j = 1, buff_size
-                        q_prim_vf(i)%sf(k, n + j, l) = &
-                            bc_buffers(2, 2)%sf(k, i, l)
+                else !< bc_y%end
+                    do i = 1, sys_size
+                        do j = 1, buff_size
+                            q_prim_vf(i)%sf(k, n + j, l) = &
+                                bc_buffers(2, 2)%sf(k, i, l)
+                        end do
                     end do
-                end do
-            end if
-#:endif
+                end if
+            #:endif
         elseif (bc_dir == 3) then !< z-direction
-#:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-            if (bc_loc == -1) then !< bc_z%beg
-                do i = 1, sys_size
-                    do j = 1, buff_size
-                        q_prim_vf(i)%sf(k, l, -j) = &
-                            bc_buffers(3, 1)%sf(k, l, i)
+            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                if (bc_loc == -1) then !< bc_z%beg
+                    do i = 1, sys_size
+                        do j = 1, buff_size
+                            q_prim_vf(i)%sf(k, l, -j) = &
+                                bc_buffers(3, 1)%sf(k, l, i)
+                        end do
                     end do
-                end do
-            else !< bc_z%end
-                do i = 1, sys_size
-                    do j = 1, buff_size
-                        q_prim_vf(i)%sf(k, l, p + j) = &
-                            bc_buffers(3, 2)%sf(k, l, i)
+                else !< bc_z%end
+                    do i = 1, sys_size
+                        do j = 1, buff_size
+                            q_prim_vf(i)%sf(k, l, p + j) = &
+                                bc_buffers(3, 2)%sf(k, l, i)
+                        end do
                     end do
-                end do
-            end if
-#:endif
+                end if
+            #:endif
         end if
 #else
         call s_ghost_cell_extrapolation(q_prim_vf, bc_dir, bc_loc, k, l)
@@ -2191,16 +2189,16 @@ contains
             deallocate (bc_buffers(1, 1)%sf)
             deallocate (bc_buffers(1, 2)%sf)
             #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-            if (n > 0) then
-                deallocate (bc_buffers(2, 1)%sf)
-                deallocate (bc_buffers(2, 2)%sf)
-                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                if (p > 0) then
-                    deallocate (bc_buffers(3, 1)%sf)
-                    deallocate (bc_buffers(3, 2)%sf)
+                if (n > 0) then
+                    deallocate (bc_buffers(2, 1)%sf)
+                    deallocate (bc_buffers(2, 2)%sf)
+                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                        if (p > 0) then
+                            deallocate (bc_buffers(3, 1)%sf)
+                            deallocate (bc_buffers(3, 2)%sf)
+                        end if
+                    #:endif
                 end if
-                #:endif
-            end if
             #:endif
         end if
 
diff --git a/src/common/m_chemistry.fpp b/src/common/m_chemistry.fpp
index 4e89ea4a8f..69aba788db 100644
--- a/src/common/m_chemistry.fpp
+++ b/src/common/m_chemistry.fpp
@@ -126,13 +126,13 @@ contains
         integer :: eqn
         real(wp) :: T
         real(wp) :: rho, omega_m
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(10) :: Ys
-        real(wp), dimension(10) :: omega
-#:else 
-        real(wp), dimension(num_species) :: Ys
-        real(wp), dimension(num_species) :: omega
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(10) :: Ys
+            real(wp), dimension(10) :: omega
+        #:else
+            real(wp), dimension(num_species) :: Ys
+            real(wp), dimension(num_species) :: omega
+        #:endif
 
         $:GPU_PARALLEL_LOOP(collapse=3, private='[Ys, omega, eqn, T, rho, omega_m]', copyin='[bounds]')
         do z = bounds(3)%beg, bounds(3)%end
@@ -174,17 +174,17 @@ contains
         type(int_bounds_info), intent(in) :: irx, iry, irz
 
         integer, intent(in) :: idir
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell
-        real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2
-        real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k
-        real(wp), dimension(10) :: Mass_Diffu_Flux, dYk_dxi
-#:else 
-        real(wp), dimension(num_species) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell
-        real(wp), dimension(num_species) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2
-        real(wp), dimension(num_species) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k
-        real(wp), dimension(num_species) :: Mass_Diffu_Flux, dYk_dxi
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(10) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell
+            real(wp), dimension(10) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2
+            real(wp), dimension(10) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k
+            real(wp), dimension(10) :: Mass_Diffu_Flux, dYk_dxi
+        #:else
+            real(wp), dimension(num_species) :: Xs_L, Xs_R, Xs_cell, Ys_L, Ys_R, Ys_cell
+            real(wp), dimension(num_species) :: mass_diffusivities_mixavg1, mass_diffusivities_mixavg2
+            real(wp), dimension(num_species) :: mass_diffusivities_mixavg_Cell, dXk_dxi, h_l, h_r, h_k
+            real(wp), dimension(num_species) :: Mass_Diffu_Flux, dYk_dxi
+        #:endif
 
         real(wp) :: Mass_Diffu_Energy
         real(wp) :: MW_L, MW_R, MW_cell, Rgas_L, Rgas_R, T_L, T_R, P_L, P_R, rho_L, rho_R, rho_cell, rho_Vic
@@ -268,11 +268,11 @@ contains
                             $:GPU_LOOP(parallelism='[seq]')
                             do i = chemxb, chemxe
                                 #:if USING_AMD
-                                h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights_nonparameter(i - chemxb + 1)
-                                h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights_nonparameter(i - chemxb + 1)
+                                    h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights_nonparameter(i - chemxb + 1)
+                                    h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights_nonparameter(i - chemxb + 1)
                                 #:else
-                                h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights(i - chemxb + 1)
-                                h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights(i - chemxb + 1)
+                                    h_l(i - chemxb + 1) = h_l(i - chemxb + 1)*gas_constant*T_L/molecular_weights(i - chemxb + 1)
+                                    h_r(i - chemxb + 1) = h_r(i - chemxb + 1)*gas_constant*T_R/molecular_weights(i - chemxb + 1)
                                 #:endif
                                 Xs_cell(i - chemxb + 1) = 0.5_wp*(Xs_L(i - chemxb + 1) + Xs_R(i - chemxb + 1))
                                 h_k(i - chemxb + 1) = 0.5_wp*(h_l(i - chemxb + 1) + h_r(i - chemxb + 1))
@@ -295,11 +295,11 @@ contains
                             $:GPU_LOOP(parallelism='[seq]')
                             do eqn = chemxb, chemxe
                                 #:if USING_AMD
-                                Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* &
-                                                                    molecular_weights_nonparameter(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1)
+                                    Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* &
+                                                                        molecular_weights_nonparameter(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1)
                                 #:else
-                                Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* &
-                                                                    molecular_weights(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1)
+                                    Mass_Diffu_Flux(eqn - chemxb + 1) = rho_cell*mass_diffusivities_mixavg_Cell(eqn - chemxb + 1)* &
+                                                                        molecular_weights(eqn - chemxb + 1)/MW_cell*dXk_dxi(eqn - chemxb + 1)
                                 #:endif
                                 rho_Vic = rho_Vic + Mass_Diffu_Flux(eqn - chemxb + 1)
                                 Mass_Diffu_Energy = Mass_Diffu_Energy + h_k(eqn - chemxb + 1)*Mass_Diffu_Flux(eqn - chemxb + 1)
diff --git a/src/common/m_phase_change.fpp b/src/common/m_phase_change.fpp
index b1505a44eb..67c1c698f3 100644
--- a/src/common/m_phase_change.fpp
+++ b/src/common/m_phase_change.fpp
@@ -91,11 +91,11 @@ contains
 
         ! $:GPU_DECLARE(create='[pS,pSOV,pSSL,TS,TSOV,TSSL,TSatOV,TSatSL]')
         ! $:GPU_DECLARE(create='[rhoe,dynE,rhos,rho,rM,m1,m2,MCT,TvF]')
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
-#:else 
-        real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
+        #:else
+            real(wp), dimension(num_fluids) :: p_infOV, p_infpT, p_infSL, sk, hk, gk, ek, rhok
+        #:endif
         ! $:GPU_DECLARE(create='[p_infOV,p_infpT,p_infSL,sk,hk,gk,ek,rhok]')
 
         !< Generic loop iterators
@@ -238,23 +238,23 @@ contains
                     do i = 1, num_fluids
                         ! entropy
                         sk(i) = cvs(i)*log((TS**gs_min(i)) &
-                                                                 /((pS + ps_inf(i))**(gs_min(i) - 1.0_wp))) + qvps(i)
+                                           /((pS + ps_inf(i))**(gs_min(i) - 1.0_wp))) + qvps(i)
 
                         ! enthalpy
                         hk(i) = gs_min(i)*cvs(i)*TS &
-                                           + qvs(i)
+                                + qvs(i)
 
                         ! Gibbs-free energy
                         gk(i) = hk(i) - TS*sk(i)
 
                         ! densities
                         rhok(i) = (pS + ps_inf(i)) &
-                                             /((gs_min(i) - 1)*cvs(i)*TS)
+                                  /((gs_min(i) - 1)*cvs(i)*TS)
 
                         ! internal energy
                         ek(i) = (pS + gs_min(i) &
-                                            *ps_inf(i))/(pS + ps_inf(i)) &
-                                           *cvs(i)*TS + qvs(i)
+                                 *ps_inf(i))/(pS + ps_inf(i)) &
+                                *cvs(i)*TS + qvs(i)
                     end do
 
                     ! calculating volume fractions, internal energies, and total entropy
@@ -299,11 +299,11 @@ contains
         ! initializing variables
         integer, intent(in) :: j, k, l, MFL
         real(wp), intent(out) :: pS
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3), intent(out) :: p_infpT
-#:else
-        real(wp), dimension(num_fluids), intent(out) :: p_infpT 
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(out) :: p_infpT
+        #:else
+            real(wp), dimension(num_fluids), intent(out) :: p_infpT
+        #:endif
         type(scalar_field), dimension(sys_size), intent(in) :: q_cons_vf
         real(wp), intent(in) :: rhoe
         real(wp), intent(out) :: TS
@@ -315,7 +315,7 @@ contains
         ! auxiliary variables for the pT-equilibrium solver
         mCP = 0.0_wp; mQ = 0.0_wp; p_infpT_sum = 0._wp
         $:GPU_LOOP(parallelism='[seq]')
-        do i = 1, num_fluids 
+        do i = 1, num_fluids
             p_infpT(i) = ps_inf(i)
             p_infpT_sum = p_infpT_sum + abs(p_infpT(i))
         end do
@@ -331,14 +331,14 @@ contains
 
         end do
 
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        if(num_fluids < 3) then
-            $:GPU_LOOP(parallelism='[seq]')
-            do i = num_fluids+1, 3 
-                p_infpT(i) = p_infpT_sum
-            end do
-        end if
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            if (num_fluids < 3) then
+                $:GPU_LOOP(parallelism='[seq]')
+                do i = num_fluids + 1, 3
+                    p_infpT(i) = p_infpT_sum
+                end do
+            end if
+        #:endif
 
         ! Checking energy constraint
         if ((rhoe - mQ - minval(p_infpT)) < 0.0_wp) then
@@ -417,19 +417,19 @@ contains
 
         integer, intent(in) :: j, k, l
         real(wp), intent(inout) :: pS
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(3), intent(in) :: p_infpT
-#:else
-        real(wp), dimension(num_fluids), intent(in) :: p_infpT 
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: p_infpT
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: p_infpT
+        #:endif
         real(wp), intent(in) :: rhoe
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         real(wp), intent(inout) :: TS
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium
-#:else
-        real(wp), dimension(num_fluids) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium 
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium
+        #:else
+            real(wp), dimension(num_fluids) :: p_infpTg !< stiffness for the participating fluids for pTg-equilibrium
+        #:endif
         real(wp), dimension(2, 2) :: Jac, InvJac, TJac !< matrices for the Newton Solver
         real(wp), dimension(2) :: R2D, DeltamP !< residual and correction array
         real(wp) :: Om ! underrelaxation factor
@@ -582,7 +582,7 @@ contains
             TJac(2, 2) = Jac(2, 2)
 
             ! dividing by det(J)
-            InvJac = InvJac / (Jac(1, 1)*Jac(2, 2) - Jac(1, 2)*Jac(2, 1))
+            InvJac = InvJac/(Jac(1, 1)*Jac(2, 2) - Jac(1, 2)*Jac(2, 1))
 
             ! calculating correction array for Newton's method
             DeltamP = -1.0_wp*(matmul(InvJac, R2D))
diff --git a/src/common/m_variables_conversion.fpp b/src/common/m_variables_conversion.fpp
index bc20350c91..dddb286836 100644
--- a/src/common/m_variables_conversion.fpp
+++ b/src/common/m_variables_conversion.fpp
@@ -128,7 +128,6 @@ contains
         real(wp) :: E_e
         real(wp) :: e_Per_Kg, Pdyn_Per_Kg
         real(wp) :: T_guess
-        
 
         integer :: s !< Generic loop iterator
 
@@ -252,10 +251,9 @@ contains
         real(wp), intent(out), target :: qv
 
         real(wp), optional, dimension(2), intent(out) :: Re_K
-        real(wp), optional, intent(out) :: G_K 
+        real(wp), optional, intent(out) :: G_K
         real(wp), dimension(num_fluids) :: alpha_rho_K, alpha_K !<
         real(wp), optional, dimension(num_fluids), intent(in) :: G
-        
 
         integer :: i, j !< Generic loop iterator
 
@@ -323,17 +321,16 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), intent(out) :: rho_K, gamma_K, pi_inf_K, qv_K
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3), intent(inout) :: alpha_rho_K, alpha_K !<
-        real(wp), optional, dimension(3), intent(in) :: G
-#:else 
-        real(wp), dimension(num_fluids), intent(inout) :: alpha_rho_K, alpha_K !<
-        real(wp), optional, dimension(num_fluids), intent(in) :: G
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(inout) :: alpha_rho_K, alpha_K !<
+            real(wp), optional, dimension(3), intent(in) :: G
+        #:else
+            real(wp), dimension(num_fluids), intent(inout) :: alpha_rho_K, alpha_K !<
+            real(wp), optional, dimension(num_fluids), intent(in) :: G
+        #:endif
         real(wp), dimension(2), intent(out) :: Re_K
         real(wp), optional, intent(out) :: G_K
         real(wp) :: alpha_K_sum
-        
 
         integer :: i, j !< Generic loop iterators
 
@@ -590,15 +587,15 @@ contains
         type(scalar_field), intent(inout) :: q_T_sf
         type(scalar_field), dimension(sys_size), intent(inout) :: qK_prim_vf
         type(int_bounds_info), dimension(1:3), intent(in) :: ibounds
-#:if USING_AMD and not MFC_CASE_OPTIMIZATION
-        real(wp), dimension(3) :: alpha_K, alpha_rho_K
-        real(wp), dimension(3) :: nRtmp
-        real(wp) :: rhoYks(1:10)
-#:else
-        real(wp), dimension(num_fluids) :: alpha_K, alpha_rho_K
-        real(wp), dimension(nb) :: nRtmp
-        real(wp) :: rhoYks(1:num_species) 
-#:endif
+        #:if USING_AMD and not MFC_CASE_OPTIMIZATION
+            real(wp), dimension(3) :: alpha_K, alpha_rho_K
+            real(wp), dimension(3) :: nRtmp
+            real(wp) :: rhoYks(1:10)
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_K, alpha_rho_K
+            real(wp), dimension(nb) :: nRtmp
+            real(wp) :: rhoYks(1:num_species)
+        #:endif
         real(wp), dimension(2) :: Re_K
         real(wp) :: rho_K, gamma_K, pi_inf_K, qv_K, dyn_pres_K
 
@@ -1190,17 +1187,17 @@ contains
         ! Partial densities, density, velocity, pressure, energy, advection
         ! variables, the specific heat ratio and liquid stiffness functions,
         ! the shear and volume Reynolds numbers and the Weber numbers
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: alpha_rho_K
-        real(wp), dimension(3) :: alpha_K
-        real(wp), dimension(3) :: vel_K
-        real(wp), dimension(10) :: Y_K
-#:else
-        real(wp), dimension(num_fluids) :: alpha_rho_K
-        real(wp), dimension(num_fluids) :: alpha_K
-        real(wp), dimension(num_vels) :: vel_K
-        real(wp), dimension(num_species) :: Y_K 
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho_K
+            real(wp), dimension(3) :: alpha_K
+            real(wp), dimension(3) :: vel_K
+            real(wp), dimension(10) :: Y_K
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho_K
+            real(wp), dimension(num_fluids) :: alpha_K
+            real(wp), dimension(num_vels) :: vel_K
+            real(wp), dimension(num_species) :: Y_K
+        #:endif
         real(wp) :: rho_K
         real(wp) :: vel_K_sum
         real(wp) :: pres_K
@@ -1337,11 +1334,11 @@ contains
             & parallelism='[seq]', cray_inline=True)
         type(scalar_field), dimension(sys_size), intent(in) :: q_vf
         integer, intent(in) :: k, l, r
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3), intent(out) :: alpha_rho_K, alpha_K
-#:else 
-        real(wp), dimension(num_fluids), intent(out) :: alpha_rho_K, alpha_K
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(out) :: alpha_rho_K, alpha_K
+        #:else
+            real(wp), dimension(num_fluids), intent(out) :: alpha_rho_K, alpha_K
+        #:endif
         integer :: i
         real(wp) :: alpha_K_sum
 
@@ -1411,11 +1408,11 @@ contains
         real(wp), intent(in) :: pres
         real(wp), intent(in) :: rho, gamma, pi_inf, qv
         real(wp), intent(in) :: H
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3), intent(in) :: adv
-#:else
-        real(wp), dimension(num_fluids), intent(in) :: adv 
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: adv
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: adv
+        #:endif
         real(wp), intent(in) :: vel_sum
         real(wp), intent(in) :: c_c
         real(wp), intent(out) :: c
diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp
index 8380f5bb02..01bd182e2d 100644
--- a/src/pre_process/m_global_parameters.fpp
+++ b/src/pre_process/m_global_parameters.fpp
@@ -300,7 +300,7 @@ module m_global_parameters
     !! to the next time-step.
 
     logical :: fft_wrt
-    logical :: dummy 
+    logical :: dummy
 
 contains
 
diff --git a/src/simulation/include/inline_riemann.fpp b/src/simulation/include/inline_riemann.fpp
index 5b4598c0aa..d8d0cc87c7 100644
--- a/src/simulation/include/inline_riemann.fpp
+++ b/src/simulation/include/inline_riemann.fpp
@@ -40,50 +40,49 @@
         eps = 0.001_wp
         call get_species_enthalpies_rt(T_L, h_iL)
         call get_species_enthalpies_rt(T_R, h_iR)
-#:if USING_AMD
-        h_iL = h_iL*gas_constant/molecular_weights_nonparameter*T_L
-        h_iR = h_iR*gas_constant/molecular_weights_nonparameter*T_R
-#:else 
-        h_iL = h_iL*gas_constant/molecular_weights*T_L
-        h_iR = h_iR*gas_constant/molecular_weights*T_R
-#:endif
+        #:if USING_AMD
+            h_iL = h_iL*gas_constant/molecular_weights_nonparameter*T_L
+            h_iR = h_iR*gas_constant/molecular_weights_nonparameter*T_R
+        #:else
+            h_iL = h_iL*gas_constant/molecular_weights*T_L
+            h_iR = h_iR*gas_constant/molecular_weights*T_R
+        #:endif
         call get_species_specific_heats_r(T_L, Cp_iL)
         call get_species_specific_heats_r(T_R, Cp_iR)
 
         h_avg_2 = (sqrt(rho_L)*h_iL + sqrt(rho_R)*h_iR)/(sqrt(rho_L) + sqrt(rho_R))
         Yi_avg = (sqrt(rho_L)*Ys_L + sqrt(rho_R)*Ys_R)/(sqrt(rho_L) + sqrt(rho_R))
         T_avg = (sqrt(rho_L)*T_L + sqrt(rho_R)*T_R)/(sqrt(rho_L) + sqrt(rho_R))
-#:if USING_AMD
-        if (abs(T_L - T_R) < eps) then
-            ! Case when T_L and T_R are very close
-            Cp_avg = sum(Yi_avg(:)*(0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights_nonparameter(:))
-            Cv_avg = sum(Yi_avg(:)*((0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights_nonparameter(:) - gas_constant/molecular_weights_nonparameter(:)))
-        else
-            ! Normal calculation when T_L and T_R are sufficiently different
-            Cp_avg = sum(Yi_avg(:)*(h_iR(:) - h_iL(:))/(T_R - T_L))
-            Cv_avg = sum(Yi_avg(:)*((h_iR(:) - h_iL(:))/(T_R - T_L) - gas_constant/molecular_weights_nonparameter(:)))
-        end if
-        gamma_avg = Cp_avg/Cv_avg
-
-        Phi_avg(:) = (gamma_avg - 1._wp)*(vel_avg_rms/2.0_wp - h_avg_2(:)) + gamma_avg*gas_constant/molecular_weights_nonparameter(:)*T_avg
-        c_sum_Yi_Phi = sum(Yi_avg(:)*Phi_avg(:))
-#:else 
-        if (abs(T_L - T_R) < eps) then
-            ! Case when T_L and T_R are very close
-            Cp_avg = sum(Yi_avg(:)*(0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights(:))
-            Cv_avg = sum(Yi_avg(:)*((0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights(:) - gas_constant/molecular_weights(:)))
-        else
-            ! Normal calculation when T_L and T_R are sufficiently different
-            Cp_avg = sum(Yi_avg(:)*(h_iR(:) - h_iL(:))/(T_R - T_L))
-            Cv_avg = sum(Yi_avg(:)*((h_iR(:) - h_iL(:))/(T_R - T_L) - gas_constant/molecular_weights(:)))
-        end if
-        gamma_avg = Cp_avg/Cv_avg
-
-        Phi_avg(:) = (gamma_avg - 1._wp)*(vel_avg_rms/2.0_wp - h_avg_2(:)) + gamma_avg*gas_constant/molecular_weights(:)*T_avg
-        c_sum_Yi_Phi = sum(Yi_avg(:)*Phi_avg(:))
-#:endif
+        #:if USING_AMD
+            if (abs(T_L - T_R) < eps) then
+                ! Case when T_L and T_R are very close
+                Cp_avg = sum(Yi_avg(:)*(0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights_nonparameter(:))
+                Cv_avg = sum(Yi_avg(:)*((0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights_nonparameter(:) - gas_constant/molecular_weights_nonparameter(:)))
+            else
+                ! Normal calculation when T_L and T_R are sufficiently different
+                Cp_avg = sum(Yi_avg(:)*(h_iR(:) - h_iL(:))/(T_R - T_L))
+                Cv_avg = sum(Yi_avg(:)*((h_iR(:) - h_iL(:))/(T_R - T_L) - gas_constant/molecular_weights_nonparameter(:)))
+            end if
+            gamma_avg = Cp_avg/Cv_avg
+
+            Phi_avg(:) = (gamma_avg - 1._wp)*(vel_avg_rms/2.0_wp - h_avg_2(:)) + gamma_avg*gas_constant/molecular_weights_nonparameter(:)*T_avg
+            c_sum_Yi_Phi = sum(Yi_avg(:)*Phi_avg(:))
+        #:else
+            if (abs(T_L - T_R) < eps) then
+                ! Case when T_L and T_R are very close
+                Cp_avg = sum(Yi_avg(:)*(0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights(:))
+                Cv_avg = sum(Yi_avg(:)*((0.5_wp*Cp_iL(:) + 0.5_wp*Cp_iR(:))*gas_constant/molecular_weights(:) - gas_constant/molecular_weights(:)))
+            else
+                ! Normal calculation when T_L and T_R are sufficiently different
+                Cp_avg = sum(Yi_avg(:)*(h_iR(:) - h_iL(:))/(T_R - T_L))
+                Cv_avg = sum(Yi_avg(:)*((h_iR(:) - h_iL(:))/(T_R - T_L) - gas_constant/molecular_weights(:)))
+            end if
+            gamma_avg = Cp_avg/Cv_avg
+
+            Phi_avg(:) = (gamma_avg - 1._wp)*(vel_avg_rms/2.0_wp - h_avg_2(:)) + gamma_avg*gas_constant/molecular_weights(:)*T_avg
+            c_sum_Yi_Phi = sum(Yi_avg(:)*Phi_avg(:))
+        #:endif
 
-        
     end if
 
 #:enddef roe_avg
diff --git a/src/simulation/m_acoustic_src.fpp b/src/simulation/m_acoustic_src.fpp
index 458f8dd514..75a61104da 100644
--- a/src/simulation/m_acoustic_src.fpp
+++ b/src/simulation/m_acoustic_src.fpp
@@ -143,11 +143,11 @@ contains
         type(scalar_field), dimension(sys_size), intent(inout) :: rhs_vf
 
         integer, intent(in) :: t_step
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: myalpha, myalpha_rho
-#:else 
-        real(wp), dimension(num_fluids) :: myalpha, myalpha_rho
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: myalpha, myalpha_rho
+        #:else
+            real(wp), dimension(num_fluids) :: myalpha, myalpha_rho
+        #:endif
         real(wp) :: myRho, B_tait
         real(wp) :: sim_time, c, small_gamma
         real(wp) :: frequency_local, gauss_sigma_time_local
diff --git a/src/simulation/m_bubbles_EE.fpp b/src/simulation/m_bubbles_EE.fpp
index 6fda2da3b1..dfc36f2a32 100644
--- a/src/simulation/m_bubbles_EE.fpp
+++ b/src/simulation/m_bubbles_EE.fpp
@@ -165,13 +165,13 @@ contains
         real(wp) :: rddot
         real(wp) :: pb_local, mv_local, vflux, pbdot
         real(wp) :: n_tait, B_tait
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: Rtmp, Vtmp
-        real(wp), dimension(3) :: myalpha, myalpha_rho
-#:else 
-        real(wp), dimension(nb) :: Rtmp, Vtmp
-        real(wp), dimension(num_fluids) :: myalpha, myalpha_rho
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: Rtmp, Vtmp
+            real(wp), dimension(3) :: myalpha, myalpha_rho
+        #:else
+            real(wp), dimension(nb) :: Rtmp, Vtmp
+            real(wp), dimension(num_fluids) :: myalpha, myalpha_rho
+        #:endif
         real(wp) :: myR, myV, alf, myP, myRho, R2Vav, R3
         real(wp) :: nbub !< Bubble number density
         real(wp) :: my_divu
diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index 28e9c8c27b..b3917c3488 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -537,11 +537,11 @@ contains
         real(wp) :: myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot
         real(wp) :: myPinf, aux1, aux2, myCson, myRho
         real(wp) :: gamma, pi_inf, qv
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: myalpha_rho, myalpha
-#:else 
-        real(wp), dimension(num_fluids) :: myalpha_rho, myalpha
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: myalpha_rho, myalpha
+        #:else
+            real(wp), dimension(num_fluids) :: myalpha_rho, myalpha
+        #:endif
         real(wp), dimension(2) :: Re
         integer, dimension(3) :: cell
 
@@ -782,11 +782,11 @@ contains
         real(wp), intent(out) :: cson
 
         real(wp) :: E, H
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: vel
-#:else 
-        real(wp), dimension(num_dims) :: vel
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: vel
+        #:else
+            real(wp), dimension(num_dims) :: vel
+        #:endif
         integer :: i
 
         $:GPU_LOOP(parallelism='[seq]')
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index bd0abfbd20..4dc7d936fb 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -650,28 +650,28 @@ contains
         real(wp) :: dpi_inf_dt
         real(wp) :: dqv_dt
         real(wp) :: dpres_ds
-#:if USING_AMD 
-        real(wp), dimension(12) :: L
-#:else 
-        real(wp), dimension(sys_size) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: alpha_rho, dalpha_rho_ds, mf
-        real(wp), dimension(3) :: vel, dvel_ds
-        real(wp), dimension(3) :: adv_local, dadv_ds
-        real(wp), dimension(3) :: dadv_dt
-        real(wp), dimension(3) :: dvel_dt
-        real(wp), dimension(3) :: dalpha_rho_dt
-        real(wp), dimension(10) :: Ys, h_k, dYs_dt, dYs_ds, Xs, Gamma_i, Cp_i
-#:else 
-        real(wp), dimension(num_fluids) :: alpha_rho, dalpha_rho_ds, mf
-        real(wp), dimension(num_vels) :: vel, dvel_ds
-        real(wp), dimension(num_fluids) :: adv_local, dadv_ds
-        real(wp), dimension(num_fluids) :: dadv_dt
-        real(wp), dimension(num_dims) :: dvel_dt
-        real(wp), dimension(num_fluids) :: dalpha_rho_dt
-        real(wp), dimension(num_species) :: Ys, h_k, dYs_dt, dYs_ds, Xs, Gamma_i, Cp_i
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12) :: L
+        #:else
+            real(wp), dimension(sys_size) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho, dalpha_rho_ds, mf
+            real(wp), dimension(3) :: vel, dvel_ds
+            real(wp), dimension(3) :: adv_local, dadv_ds
+            real(wp), dimension(3) :: dadv_dt
+            real(wp), dimension(3) :: dvel_dt
+            real(wp), dimension(3) :: dalpha_rho_dt
+            real(wp), dimension(10) :: Ys, h_k, dYs_dt, dYs_ds, Xs, Gamma_i, Cp_i
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho, dalpha_rho_ds, mf
+            real(wp), dimension(num_vels) :: vel, dvel_ds
+            real(wp), dimension(num_fluids) :: adv_local, dadv_ds
+            real(wp), dimension(num_fluids) :: dadv_dt
+            real(wp), dimension(num_dims) :: dvel_dt
+            real(wp), dimension(num_fluids) :: dalpha_rho_dt
+            real(wp), dimension(num_species) :: Ys, h_k, dYs_dt, dYs_ds, Xs, Gamma_i, Cp_i
+        #:endif
         real(wp), dimension(2) :: Re_cbc
         real(wp), dimension(3) :: lambda
 
@@ -686,7 +686,6 @@ contains
         real(wp) :: Ma
         real(wp) :: T, sum_Enthalpies
         real(wp) :: Cv, Cp, e_mix, Mw, R_gas
-        
 
         real(wp) :: vel_K_sum, vel_dv_dt_sum
 
@@ -744,8 +743,8 @@ contains
                     $:END_GPU_PARALLEL_LOOP()
                 end if
 
-                    ! PI4 of flux_rs_vf and flux_src_rs_vf at j = 1/2, 3/2
-                if(weno_order == 5 .or. dummy) then
+                ! PI4 of flux_rs_vf and flux_src_rs_vf at j = 1/2, 3/2
+                if (weno_order == 5 .or. dummy) then
                     call s_convert_primitive_to_flux_variables(q_prim_rs${XYZ}$_vf, &
                                                                F_rs${XYZ}$_vf, &
                                                                F_src_rs${XYZ}$_vf, &
@@ -1070,7 +1069,7 @@ contains
                             sum_Enthalpies = 0._wp
                             $:GPU_LOOP(parallelism='[seq]')
                             do i = 1, num_species
-                                
+
                                 #:if USING_AMD
                                     h_k(i) = h_k(i)*gas_constant/molecular_weights_nonparameter(i)*T
                                     sum_Enthalpies = sum_Enthalpies + (rho*h_k(i) - pres*Mw/molecular_weights_nonparameter(i)*Cp/R_gas)*dYs_dt(i)
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index aa0dd699f6..8c6a1d9fca 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -23,11 +23,11 @@ contains
         $:GPU_ROUTINE(parallelism='[seq]')
         real(wp), dimension(3), intent(in) :: lambda
         real(wp), intent(in) :: rho, c, dpres_ds
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3), intent(in) :: dvel_ds
-#:else
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds 
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: dvel_ds
+        #:else
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+        #:endif
         real(wp) :: L1
         L1 = lambda(1)*(dpres_ds - rho*c*dvel_ds(dir_idx(1)))
     end function f_base_L1
@@ -35,16 +35,16 @@ contains
     !> Fill density L variables
     subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
-        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
-#:else   
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+        #:endif
         real(wp), intent(in) :: lambda_factor, lambda2, c
         real(wp), intent(in) :: dpres_ds
         integer :: i
@@ -58,16 +58,16 @@ contains
     !> Fill velocity L variables
     subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
-        real(wp), dimension(3), intent(in) :: dvel_ds
-#:else         
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: dvel_ds
+        #:else
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+        #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer :: i
 
@@ -80,16 +80,16 @@ contains
     !> Fill advection L variables
     subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
-        real(wp), dimension(3), intent(in) :: dadv_ds
-#:else         
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: dadv_ds
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+        #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer :: i
 
@@ -102,16 +102,16 @@ contains
     !> Fill chemistry L variables
     subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
-        real(wp), dimension(10), intent(in) :: dYs_ds
-#:else    
-        real(wp), dimension(num_species), intent(in) :: dYs_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(10), intent(in) :: dYs_ds
+        #:else
+            real(wp), dimension(num_species), intent(in) :: dYs_ds
+        #:endif
         real(wp), intent(in) :: lambda_factor, lambda2
         integer :: i
 
@@ -129,16 +129,16 @@ contains
             & cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
-        real(wp), dimension(3), intent(in) :: dvel_ds
-#:else        
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: dvel_ds
+        #:else
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+        #:endif
         real(wp), intent(in) :: rho, c, dpres_ds
         integer :: i
 
@@ -153,22 +153,22 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD      
-        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(3), intent(in) :: dvel_ds
-        real(wp), dimension(3), intent(in) :: dadv_ds
-        real(wp), dimension(10), intent(in) :: dYs_ds
-#:else      
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
-        real(wp), dimension(num_species), intent(in) :: dYs_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(3), intent(in) :: dvel_ds
+            real(wp), dimension(3), intent(in) :: dadv_ds
+            real(wp), dimension(10), intent(in) :: dYs_ds
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(num_species), intent(in) :: dYs_ds
+        #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
 
@@ -193,18 +193,17 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
-        real(wp), dimension(3), intent(in) :: dvel_ds
-#:else      
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds 
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: dvel_ds
+        #:else
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+        #:endif
         real(wp), intent(in) :: rho, c, dpres_ds
-        
 
         L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
         L(2:advxe) = 0._wp
@@ -217,22 +216,22 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
-        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(3), intent(in) :: dvel_ds
-        real(wp), dimension(3), intent(in) :: dadv_ds
-        real(wp), dimension(10), intent(in) :: dYs_ds
-#:else        
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
-        real(wp), dimension(num_species), intent(in) :: dYs_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(3), intent(in) :: dvel_ds
+            real(wp), dimension(3), intent(in) :: dadv_ds
+            real(wp), dimension(10), intent(in) :: dYs_ds
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(num_species), intent(in) :: dYs_ds
+        #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
 
@@ -250,20 +249,20 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
-        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(3), intent(in) :: dvel_ds
-        real(wp), dimension(3), intent(in) :: dadv_ds
-#:else         
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(3), intent(in) :: dvel_ds
+            real(wp), dimension(3), intent(in) :: dadv_ds
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+        #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
 
@@ -280,24 +279,23 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD       
-        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(3), intent(in) :: dvel_ds
-        real(wp), dimension(3), intent(in) :: dadv_ds
-#:else        
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(3), intent(in) :: dvel_ds
+            real(wp), dimension(3), intent(in) :: dadv_ds
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+        #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
 
-
         L(1) = f_base_L1(lambda, rho, c, dpres_ds, dvel_ds)
         call s_fill_density_L(L, 1._wp, lambda(2), c, mf, dalpha_rho_ds, dpres_ds)
         call s_fill_velocity_L(L, 1._wp, lambda(2), dvel_ds)
@@ -309,11 +307,11 @@ contains
     subroutine s_compute_supersonic_inflow_L(L)
         $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', &
             & parallelism='[seq]', cray_inline=True)
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
         L(1:advxe) = 0._wp
         if (chemistry) L(chemxb:chemxe) = 0._wp
     end subroutine s_compute_supersonic_inflow_L
@@ -324,22 +322,22 @@ contains
             & parallelism='[seq]', cray_inline=True)
 
         real(wp), dimension(3), intent(in) :: lambda
-#:if USING_AMD 
-        real(wp), dimension(12), intent(inout) :: L
-#:else 
-        real(wp), dimension(sys_size), intent(inout) :: L 
-#:endif
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(3), intent(in) :: dvel_ds
-        real(wp), dimension(3), intent(in) :: dadv_ds
-        real(wp), dimension(10), intent(in) :: dYs_ds
-#:else  
-        real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
-        real(wp), dimension(num_dims), intent(in) :: dvel_ds
-        real(wp), dimension(num_fluids), intent(in) :: dadv_ds
-        real(wp), dimension(num_species), intent(in) :: dYs_ds
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(12), intent(inout) :: L
+        #:else
+            real(wp), dimension(sys_size), intent(inout) :: L
+        #:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(3), intent(in) :: dvel_ds
+            real(wp), dimension(3), intent(in) :: dadv_ds
+            real(wp), dimension(10), intent(in) :: dYs_ds
+        #:else
+            real(wp), dimension(num_fluids), intent(in) :: mf, dalpha_rho_ds
+            real(wp), dimension(num_dims), intent(in) :: dvel_ds
+            real(wp), dimension(num_fluids), intent(in) :: dadv_ds
+            real(wp), dimension(num_species), intent(in) :: dYs_ds
+        #:endif
         real(wp), intent(in) :: rho, c
         real(wp), intent(in) :: dpres_ds
 
diff --git a/src/simulation/m_data_output.fpp b/src/simulation/m_data_output.fpp
index 97c1ce5a4d..6e97af027c 100644
--- a/src/simulation/m_data_output.fpp
+++ b/src/simulation/m_data_output.fpp
@@ -268,13 +268,13 @@ contains
         integer, intent(in) :: t_step
 
         real(wp) :: rho        !< Cell-avg. density
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
-        real(wp), dimension(3) :: vel        !< Cell-avg. velocity
-#:else
-        real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
-        real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
+            real(wp), dimension(3) :: vel        !< Cell-avg. velocity
+        #:else
+            real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
+            real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
+        #:endif
         real(wp) :: vel_sum    !< Cell-avg. velocity sum
         real(wp) :: pres       !< Cell-avg. pressure
         real(wp) :: gamma      !< Cell-avg. sp. heat ratio
@@ -1574,30 +1574,30 @@ contains
                 elseif (p == 0) then
                     if (bubbles_euler) then
                         #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                        write (i + 30, '(6X,10F24.8)') &
-                            nondim_time, &
-                            rho, &
-                            vel(1), &
-                            vel(2), &
-                            pres, &
-                            alf, &
-                            nR(1), &
-                            nRdot(1), &
-                            R(1), &
-                            Rdot(1)
+                            write (i + 30, '(6X,10F24.8)') &
+                                nondim_time, &
+                                rho, &
+                                vel(1), &
+                                vel(2), &
+                                pres, &
+                                alf, &
+                                nR(1), &
+                                nRdot(1), &
+                                R(1), &
+                                Rdot(1)
                         #:endif
                     else if (elasticity) then
                         #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                        write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
-                               'F24.8,F24.8,F24.8)') &
-                            nondim_time, &
-                            rho, &
-                            vel(1), &
-                            vel(2), &
-                            pres, &
-                            tau_e(1), &
-                            tau_e(2), &
-                            tau_e(3)
+                            write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
+                                   'F24.8,F24.8,F24.8)') &
+                                nondim_time, &
+                                rho, &
+                                vel(1), &
+                                vel(2), &
+                                pres, &
+                                tau_e(1), &
+                                tau_e(2), &
+                                tau_e(3)
                         #:endif
                     else
                         write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8)') &
@@ -1609,20 +1609,20 @@ contains
                     end if
                 else
                     #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                    write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
-                           'F24.8,F24.8,F24.8,F24.8,F24.8,'// &
-                           'F24.8)') &
-                        nondim_time, &
-                        rho, &
-                        vel(1), &
-                        vel(2), &
-                        vel(3), &
-                        pres, &
-                        gamma, &
-                        pi_inf, &
-                        qv, &
-                        c, &
-                        accel
+                        write (i + 30, '(6X,F12.6,F24.8,F24.8,F24.8,F24.8,'// &
+                               'F24.8,F24.8,F24.8,F24.8,F24.8,'// &
+                               'F24.8)') &
+                            nondim_time, &
+                            rho, &
+                            vel(1), &
+                            vel(2), &
+                            vel(3), &
+                            pres, &
+                            gamma, &
+                            pi_inf, &
+                            qv, &
+                            c, &
+                            accel
                     #:endif
                 end if
             end if
diff --git a/src/simulation/m_hyperelastic.fpp b/src/simulation/m_hyperelastic.fpp
index 52914d9938..cc59b8ba90 100644
--- a/src/simulation/m_hyperelastic.fpp
+++ b/src/simulation/m_hyperelastic.fpp
@@ -98,17 +98,17 @@ contains
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         type(scalar_field), dimension(sys_size), intent(inout) :: q_prim_vf
-#:if USING_AMD
-        real(wp), dimension(9) :: tensora, tensorb
-#:else 
-        real(wp), dimension(tensor_size) :: tensora, tensorb
-#:endif
-
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: alpha_k, alpha_rho_k
-#:else 
-        real(wp), dimension(num_fluids) :: alpha_k, alpha_rho_k
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(9) :: tensora, tensorb
+        #:else
+            real(wp), dimension(tensor_size) :: tensora, tensorb
+        #:endif
+
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_k, alpha_rho_k
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_k, alpha_rho_k
+        #:endif
         real(wp), dimension(2) :: Re
         real(wp) :: rho, gamma, pi_inf, qv
         real(wp) :: G_local
diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 1d4d712226..bc023f4ecc 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -179,19 +179,19 @@ contains
         real(wp) :: pres_IP
         real(wp), dimension(3) :: vel_IP, vel_norm_IP
         real(wp) :: c_IP
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD        
-        real(wp), dimension(3) :: Gs
-        real(wp), dimension(3) :: alpha_rho_IP, alpha_IP
-        real(wp), dimension(3) :: r_IP, v_IP, pb_IP, mv_IP
-        real(wp), dimension(18) :: nmom_IP
-        real(wp), dimension(12) :: presb_IP, massv_IP
-#:else 
-        real(wp), dimension(num_fluids) :: Gs
-        real(wp), dimension(num_fluids) :: alpha_rho_IP, alpha_IP
-        real(wp), dimension(nb) :: r_IP, v_IP, pb_IP, mv_IP
-        real(wp), dimension(nb*nmom) :: nmom_IP
-        real(wp), dimension(nb*nnode) :: presb_IP, massv_IP
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: Gs
+            real(wp), dimension(3) :: alpha_rho_IP, alpha_IP
+            real(wp), dimension(3) :: r_IP, v_IP, pb_IP, mv_IP
+            real(wp), dimension(18) :: nmom_IP
+            real(wp), dimension(12) :: presb_IP, massv_IP
+        #:else
+            real(wp), dimension(num_fluids) :: Gs
+            real(wp), dimension(num_fluids) :: alpha_rho_IP, alpha_IP
+            real(wp), dimension(nb) :: r_IP, v_IP, pb_IP, mv_IP
+            real(wp), dimension(nb*nmom) :: nmom_IP
+            real(wp), dimension(nb*nnode) :: presb_IP, massv_IP
+        #:endif
         !! Primitive variables at the image point associated with a ghost point,
         !! interpolated from surrounding fluid cells.
 
@@ -862,11 +862,11 @@ contains
         real(wp), intent(INOUT) :: pres_IP
         real(wp), dimension(3), intent(INOUT) :: vel_IP
         real(wp), intent(INOUT) :: c_IP
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3), intent(INOUT) :: alpha_IP, alpha_rho_IP
-#:else 
-        real(wp), dimension(num_fluids), intent(INOUT) :: alpha_IP, alpha_rho_IP
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3), intent(INOUT) :: alpha_IP, alpha_rho_IP
+        #:else
+            real(wp), dimension(num_fluids), intent(INOUT) :: alpha_IP, alpha_rho_IP
+        #:endif
         real(wp), optional, dimension(:), intent(INOUT) :: r_IP, v_IP, pb_IP, mv_IP
         real(wp), optional, dimension(:), intent(INOUT) :: nmom_IP
         real(wp), optional, dimension(:), intent(INOUT) :: presb_IP, massv_IP
diff --git a/src/simulation/m_igr.fpp b/src/simulation/m_igr.fpp
index 7e27a51de9..8aff221313 100644
--- a/src/simulation/m_igr.fpp
+++ b/src/simulation/m_igr.fpp
@@ -57,36 +57,36 @@ module m_igr
 
 #if defined(MFC_OpenMP)
             real(wp) :: coeff_L(-1:3) = [ &
-                            -3._wp/60._wp, &  ! Index -1
-                            27._wp/60._wp, &  ! Index 0
-                            47._wp/60._wp, &  ! Index 1
-                            -13._wp/60._wp, &  ! Index 2
-                            2._wp/60._wp &  ! Index 3
-                            ]
+                        -3._wp/60._wp, &  ! Index -1
+                        27._wp/60._wp, &  ! Index 0
+                        47._wp/60._wp, &  ! Index 1
+                        -13._wp/60._wp, &  ! Index 2
+                        2._wp/60._wp &  ! Index 3
+                        ]
 
             real(wp) :: coeff_R(-2:2) = [ &
-                            2._wp/60._wp, &  ! Index -2
-                            -13._wp/60._wp, &  ! Index -1
-                            47._wp/60._wp, &  ! Index 0
-                            27._wp/60._wp, &  ! Index 1
-                            -3._wp/60._wp &  ! Index 2
-                            ]
+                        2._wp/60._wp, &  ! Index -2
+                        -13._wp/60._wp, &  ! Index -1
+                        47._wp/60._wp, &  ! Index 0
+                        27._wp/60._wp, &  ! Index 1
+                        -3._wp/60._wp &  ! Index 2
+                        ]
 #else
             real(wp), parameter :: coeff_L(-1:3) = [ &
-                                       -3._wp/60._wp, &  ! Index -1
-                                       27._wp/60._wp, &  ! Index 0
-                                       47._wp/60._wp, &  ! Index 1
-                                       -13._wp/60._wp, &  ! Index 2
-                                       2._wp/60._wp &  ! Index 3
-                                       ]
+                                   -3._wp/60._wp, &  ! Index -1
+                                   27._wp/60._wp, &  ! Index 0
+                                   47._wp/60._wp, &  ! Index 1
+                                   -13._wp/60._wp, &  ! Index 2
+                                   2._wp/60._wp &  ! Index 3
+                                   ]
 
             real(wp), parameter :: coeff_R(-2:2) = [ &
-                                       2._wp/60._wp, &  ! Index -2
-                                       -13._wp/60._wp, &  ! Index -1
-                                       47._wp/60._wp, &  ! Index 0
-                                       27._wp/60._wp, &  ! Index 1
-                                       -3._wp/60._wp &  ! Index 2
-                                       ]
+                                   2._wp/60._wp, &  ! Index -2
+                                   -13._wp/60._wp, &  ! Index -1
+                                   47._wp/60._wp, &  ! Index 0
+                                   27._wp/60._wp, &  ! Index 1
+                                   -3._wp/60._wp &  ! Index 2
+                                   ]
 #endif
         #:elif igr_order == 3
             integer, parameter :: vidxb = -1
@@ -94,26 +94,26 @@ module m_igr
 
 #if defined(MFC_OpenMP)
             real(wp) :: coeff_L(0:2) = [ &
-                            2._wp/6._wp, & ! Index 0
-                            5._wp/6._wp, & ! Index 1
-                            -1._wp/6._wp & ! Index 2
-                            ]
+                        2._wp/6._wp, & ! Index 0
+                        5._wp/6._wp, & ! Index 1
+                        -1._wp/6._wp & ! Index 2
+                        ]
             real(wp) :: coeff_R(-1:1) = [ &
-                            -1._wp/6._wp, & ! Index -1
-                            5._wp/6._wp, & ! Index 0
-                            2._wp/6._wp & ! Index 1
-                            ]
+                        -1._wp/6._wp, & ! Index -1
+                        5._wp/6._wp, & ! Index 0
+                        2._wp/6._wp & ! Index 1
+                        ]
 #else
             real(wp), parameter :: coeff_L(0:2) = [ &
-                                       2._wp/6._wp, & ! Index 0
-                                       5._wp/6._wp, & ! Index 1
-                                       -1._wp/6._wp & ! Index 2
-                                       ]
+                                   2._wp/6._wp, & ! Index 0
+                                   5._wp/6._wp, & ! Index 1
+                                   -1._wp/6._wp & ! Index 2
+                                   ]
             real(wp), parameter :: coeff_R(-1:1) = [ &
-                                       -1._wp/6._wp, & ! Index -1
-                                       5._wp/6._wp, & ! Index 0
-                                       2._wp/6._wp & ! Index 1
-                                       ]
+                                   -1._wp/6._wp, & ! Index -1
+                                   5._wp/6._wp, & ! Index 0
+                                   2._wp/6._wp & ! Index 1
+                                   ]
 #endif
         #:endif
 
@@ -217,10 +217,10 @@ contains
 
         #:if not MFC_CASE_OPTIMIZATION
             if (igr_order == 3) then
-                vidxb = -1; vidxe = 2;
+                vidxb = -1; vidxe = 2; 
                 $:GPU_UPDATE(device='[vidxb, vidxe]')
 
-                 @:ALLOCATE(coeff_L(0:2))
+                @:ALLOCATE(coeff_L(0:2))
                 coeff_L(0) = (2._wp/6._wp)
                 coeff_L(1) = (5._wp/6._wp)
                 coeff_L(2) = (-1._wp/6._wp)
@@ -230,9 +230,8 @@ contains
                 coeff_R(0) = (5._wp/6._wp)
                 coeff_R(-1) = (-1._wp/6._wp)
 
-
             elseif (igr_order == 5) then
-                vidxb = -2; vidxe = 3;
+                vidxb = -2; vidxe = 3; 
                 $:GPU_UPDATE(device='[vidxb, vidxe]')
 
                 @:ALLOCATE(coeff_L(-1:3))
@@ -482,420 +481,420 @@ contains
         if (idir == 1) then
             if (p == 0) then
                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]')
-                do l = 0, p
-                    do k = 0, n
-                        do j = -1, m
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]')
+                    do l = 0, p
+                        do k = 0, n
+                            do j = -1, m
 
-                            vflux_L_arr = 0._wp
-                            vflux_R_arr = 0._wp
+                                vflux_L_arr = 0._wp
+                                vflux_R_arr = 0._wp
 
-                            #:if MFC_CASE_OPTIMIZATION
-                                #:if igr_order == 5
-                                    !DIR$ unroll 6
-                                #:elif igr_order == 3
-                                    !DIR$ unroll 4
+                                #:if MFC_CASE_OPTIMIZATION
+                                    #:if igr_order == 5
+                                        !DIR$ unroll 6
+                                    #:elif igr_order == 3
+                                        !DIR$ unroll 4
+                                    #:endif
                                 #:endif
-                            #:endif
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb, vidxe
-                                !x-direction contributions
                                 $:GPU_LOOP(parallelism='[seq]')
-                                do i = -1, 1
-                                    rho_L = 0._wp
+                                do q = vidxb, vidxe
+                                    !x-direction contributions
                                     $:GPU_LOOP(parallelism='[seq]')
-                                    do r = 1, num_fluids
-                                        rho_L = rho_L + q_cons_vf(r)%sf(j + i + q, k, l)
+                                    do i = -1, 1
+                                        rho_L = 0._wp
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do r = 1, num_fluids
+                                            rho_L = rho_L + q_cons_vf(r)%sf(j + i + q, k, l)
+                                        end do
+                                        rho_sf_small(i) = rho_L
                                     end do
-                                    rho_sf_small(i) = rho_L
-                                end do
 
-                                dvel_small(1) = (1/(2._wp*dx(j)))*( &
-                                                1._wp*q_cons_vf(momxb)%sf(j + 1 + q, k, l)/rho_sf_small(1) - &
-                                                1._wp*q_cons_vf(momxb)%sf(j - 1 + q, k, l)/rho_sf_small(-1))
-                                dvel_small(2) = (1/(2._wp*dx(j)))*( &
-                                                q_cons_vf(momxb + 1)%sf(j + 1 + q, k, l)/rho_sf_small(1) - &
-                                                q_cons_vf(momxb + 1)%sf(j - 1 + q, k, l)/rho_sf_small(-1))
+                                    dvel_small(1) = (1/(2._wp*dx(j)))*( &
+                                                    1._wp*q_cons_vf(momxb)%sf(j + 1 + q, k, l)/rho_sf_small(1) - &
+                                                    1._wp*q_cons_vf(momxb)%sf(j - 1 + q, k, l)/rho_sf_small(-1))
+                                    dvel_small(2) = (1/(2._wp*dx(j)))*( &
+                                                    q_cons_vf(momxb + 1)%sf(j + 1 + q, k, l)/rho_sf_small(1) - &
+                                                    q_cons_vf(momxb + 1)%sf(j - 1 + q, k, l)/rho_sf_small(-1))
+
+                                    if (q == 0) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, num_dims
+                                            dvel(i, 1) = dvel_small(i)
+                                        end do
+                                    end if
+
+                                    if (q > vidxb) then
+                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(1))/3._wp
+                                    end if
+                                    if (q < vidxe) then
+                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(1))/3._wp
+                                    end if
 
-                                if (q == 0) then
+                                    !y-direction contributions
                                     $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_dims
-                                        dvel(i, 1) = dvel_small(i)
+                                    do i = -1, 1
+                                        rho_L = 0._wp
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do r = 1, num_fluids
+                                            rho_L = rho_L + q_cons_vf(r)%sf(j + q, k + i, l)
+                                        end do
+                                        rho_sf_small(i) = rho_L
                                     end do
-                                end if
 
-                                if (q > vidxb) then
-                                    vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2))
-                                    vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(1))/3._wp
-                                end if
-                                if (q < vidxe) then
-                                    vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2))
-                                    vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(1))/3._wp
-                                end if
+                                    dvel_small(1) = (1/(2._wp*dy(k)))*( &
+                                                    q_cons_vf(momxb)%sf(j + q, k + 1, l)/rho_sf_small(1) - &
+                                                    q_cons_vf(momxb)%sf(j + q, k - 1, l)/rho_sf_small(-1))
+                                    dvel_small(2) = (1/(2._wp*dy(k)))*( &
+                                                    q_cons_vf(momxb + 1)%sf(j + q, k + 1, l)/rho_sf_small(1) - &
+                                                    q_cons_vf(momxb + 1)%sf(j + q, k - 1, l)/rho_sf_small(-1))
+
+                                    if (q == 0) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, num_dims
+                                            dvel(i, 2) = dvel_small(i)
+                                        end do
+                                    end if
+
+                                    if (q > vidxb) then
+                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
+                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp
+                                    end if
+                                    if (q < vidxe) then
+                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
+                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp
+                                    end if
+
+                                    if (q == 0) then
+                                        jac_rhs(j, k, l) = real(alf_igr*(2._wp*(dvel(1, 2)*dvel(2, 1)) &
+                                                                         + dvel(1, 1)**2._wp + dvel(2, 2)**2._wp &
+                                                                         + (dvel(1, 1) + dvel(2, 2))**2._wp), kind=stp)
+                                    end if
+                                end do
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_fluids
+                                    alpha_rho_L(i) = 0._wp
+                                    alpha_rho_R(i) = 0._wp
+                                    alpha_L(i) = 0._wp
+                                    alpha_R(i) = 0._wp
+                                end do
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_L(i) = 0._wp
+                                    vel_R(i) = 0._wp
+                                end do
 
-                                !y-direction contributions
                                 $:GPU_LOOP(parallelism='[seq]')
-                                do i = -1, 1
-                                    rho_L = 0._wp
+                                do q = vidxb + 1, vidxe
                                     $:GPU_LOOP(parallelism='[seq]')
-                                    do r = 1, num_fluids
-                                        rho_L = rho_L + q_cons_vf(r)%sf(j + q, k + i, l)
+                                    do i = 1, num_fluids
+                                        alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l)
                                     end do
-                                    rho_sf_small(i) = rho_L
-                                end do
 
-                                dvel_small(1) = (1/(2._wp*dy(k)))*( &
-                                                q_cons_vf(momxb)%sf(j + q, k + 1, l)/rho_sf_small(1) - &
-                                                q_cons_vf(momxb)%sf(j + q, k - 1, l)/rho_sf_small(-1))
-                                dvel_small(2) = (1/(2._wp*dy(k)))*( &
-                                                q_cons_vf(momxb + 1)%sf(j + q, k + 1, l)/rho_sf_small(1) - &
-                                                q_cons_vf(momxb + 1)%sf(j + q, k - 1, l)/rho_sf_small(-1))
+                                    if (num_fluids > 1) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, num_fluids - 1
+                                            alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
+                                        end do
+                                    else
+                                        alpha_L(1) = 1._wp
+                                    end if
 
-                                if (q == 0) then
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_dims
-                                        dvel(i, 2) = dvel_small(i)
+                                        vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
                                     end do
-                                end if
-
-                                if (q > vidxb) then
-                                    vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
-                                    vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(2))/3._wp
-                                end if
-                                if (q < vidxe) then
-                                    vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
-                                    vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(2))/3._wp
-                                end if
+                                end do
 
-                                if (q == 0) then
-                                    jac_rhs(j, k, l) = real(alf_igr*(2._wp*(dvel(1, 2)*dvel(2, 1)) &
-                                                                     + dvel(1, 1)**2._wp + dvel(2, 2)**2._wp &
-                                                                     + (dvel(1, 1) + dvel(2, 2))**2._wp), kind=stp)
-                                end if
-                            end do
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do q = vidxb, vidxe - 1
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l)
+                                    end do
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_rho_L(i) = 0._wp
-                                alpha_rho_R(i) = 0._wp
-                                alpha_L(i) = 0._wp
-                                alpha_R(i) = 0._wp
-                            end do
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_L(i) = 0._wp
-                                vel_R(i) = 0._wp
-                            end do
+                                    if (num_fluids > 1) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, num_fluids - 1
+                                            alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
+                                        end do
+                                    else
+                                        alpha_R(1) = 1._wp
+                                    end if
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb + 1, vidxe
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j + q, k, l)
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
+                                    end do
                                 end do
 
                                 if (num_fluids > 1) then
+
+                                    alpha_L(num_fluids) = 1._wp
+                                    alpha_R(num_fluids) = 1._wp
+
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, num_fluids - 1
-                                        alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
+                                        alpha_L(num_fluids) = alpha_L(num_fluids) - alpha_L(i)
+                                        alpha_R(num_fluids) = alpha_R(num_fluids) - alpha_R(i)
                                     end do
-                                else
-                                    alpha_L(1) = 1._wp
                                 end if
 
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
-                                end do
-                            end do
+                                rho_L = 0._wp; rho_R = 0._wp
+                                gamma_L = 0._wp; gamma_R = 0._wp
+                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb, vidxe - 1
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j + q, k, l)
-                                end do
+                                    rho_L = rho_L + alpha_rho_L(i)
+                                    gamma_L = gamma_L + alpha_L(i)*gammas(i)
+                                    pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
 
-                                if (num_fluids > 1) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids - 1
-                                        alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j + q, k, l)
-                                    end do
-                                else
-                                    alpha_R(1) = 1._wp
-                                end if
+                                    rho_R = rho_R + alpha_rho_R(i)
+                                    gamma_R = gamma_R + alpha_R(i)*gammas(i)
+                                    pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
+                                end do
 
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
-                                    vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j + q, k, l)
+                                    vel_L(i) = vel_L(i)/rho_L
+                                    vel_R(i) = vel_R(i)/rho_R
                                 end do
-                            end do
 
-                            if (num_fluids > 1) then
+                                if (viscous) then
+                                    mu_L = 0._wp; mu_R = 0._wp
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        mu_L = alpha_L(i)/Res_igr(1, i) + mu_L
+                                        mu_R = alpha_R(i)/Res_igr(1, i) + mu_R
+                                    end do
 
-                                alpha_L(num_fluids) = 1._wp
-                                alpha_R(num_fluids) = 1._wp
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - &
+                                                                        real(0.5_wp*dt*mu_L*vflux_L_arr(1)*(1._wp/dx(j + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - &
+                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j + 1)), kind=stp)
 
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids - 1
-                                    alpha_L(num_fluids) = alpha_L(num_fluids) - alpha_L(i)
-                                    alpha_R(num_fluids) = alpha_R(num_fluids) - alpha_R(i)
-                                end do
-                            end if
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + &
+                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(1)*(1._wp/dx(j)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j)), kind=stp)
 
-                            rho_L = 0._wp; rho_R = 0._wp
-                            gamma_L = 0._wp; gamma_R = 0._wp
-                            pi_inf_L = 0._wp; pi_inf_R = 0._wp
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - &
+                                                                        real(0.5_wp*dt*mu_R*vflux_R_arr(1)*(1._wp/dx(j + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - &
+                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j + 1)), kind=stp)
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                rho_L = rho_L + alpha_rho_L(i)
-                                gamma_L = gamma_L + alpha_L(i)*gammas(i)
-                                pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + &
+                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(1)*(1._wp/dx(j)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j)), kind=stp)
 
-                                rho_R = rho_R + alpha_rho_R(i)
-                                gamma_R = gamma_R + alpha_R(i)*gammas(i)
-                                pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
-                            end do
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - &
+                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(3)*(1._wp/dx(j + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - &
+                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j + 1)), kind=stp)
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_L(i) = vel_L(i)/rho_L
-                                vel_R(i) = vel_R(i)/rho_R
-                            end do
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(3)*(1._wp/dx(j)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j)), kind=stp)
+
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - &
+                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(3)*(1._wp/dx(j + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - &
+                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j + 1)), kind=stp)
+
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(3)*(1._wp/dx(j)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j)), kind=stp)
+                                end if
+
+                                E_L = 0._wp; E_R = 0._wp
+
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do q = vidxb + 1, vidxe
+                                    E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j + q, k, l)
+                                end do
 
-                            if (viscous) then
-                                mu_L = 0._wp; mu_R = 0._wp
                                 $:GPU_LOOP(parallelism='[seq]')
+                                do q = vidxb, vidxe - 1
+                                    E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j + q, k, l)
+                                end do
+
+                                call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
+                                                          E_R, gamma_R, pi_inf_R, rho_R, vel_R, &
+                                                          pres_L, pres_R, cfl)
+
                                 do i = 1, num_fluids
-                                    mu_L = alpha_L(i)/Res_igr(1, i) + mu_L
-                                    mu_R = alpha_R(i)/Res_igr(1, i) + mu_R
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + &
+                                                                real((0.5_wp*dt*(alpha_rho_L(i)* &
+                                                                                 vel_L(1))*(1._wp/dx(j + 1)) - &
+                                                                      0.5_wp*dt*cfl*(alpha_rho_L(i))*(1._wp/dx(j + 1))), kind=stp)
+
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - &
+                                                            real((0.5_wp*dt*(alpha_rho_L(i)* &
+                                                                             vel_L(1))*(1._wp/dx(j)) - &
+                                                                  0.5_wp*dt*cfl*(alpha_rho_L(i))*(1._wp/dx(j))), kind=stp)
                                 end do
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - &
-                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(1)*(1._wp/dx(j + 1)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - &
-                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j + 1)), kind=stp)
+                                if (num_fluids > 1) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + &
+                                                                                real((0.5_wp*dt*(alpha_L(i)* &
+                                                                                                 vel_L(1))*(1._wp/dx(j + 1)) - &
+                                                                                      0.5_wp*dt*cfl*(alpha_L(i))*(1._wp/dx(j + 1))), kind=stp)
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + &
-                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(1)*(1._wp/dx(j)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_L*vflux_L_arr(1)*vel_L(2)*(1._wp/dx(j)), kind=stp)
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) &
+                                                                                - real((0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_L(1)*(1._wp/dx(j + 1))), kind=stp)
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) - &
-                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(1)*(1._wp/dx(j + 1)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - &
-                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j + 1)), kind=stp)
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - &
+                                                                            real((0.5_wp*dt*(alpha_L(i)* &
+                                                                                             vel_L(1))*(1._wp/dx(j)) - &
+                                                                                  0.5_wp*dt*cfl*(alpha_L(i))*(1._wp/dx(j))), kind=stp)
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + &
-                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(1)*(1._wp/dx(j)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_R*vflux_R_arr(1)*vel_R(2)*(1._wp/dx(j)), kind=stp)
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) &
+                                                                            + real((0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(1)*(1._wp/dx(j))), kind=stp)
+                                    end do
+                                end if
 
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - &
-                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(3)*(1._wp/dx(j + 1)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - &
-                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j + 1)), kind=stp)
+                                rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + &
+                                                                real((0.5_wp*dt*(rho_L*(vel_L(1))**2.0 + &
+                                                                                 pres_L)*(1._wp/dx(j + 1)) - &
+                                                                      0.5_wp*dt*cfl*(rho_L*vel_L(1))*(1._wp/dx(j + 1))), kind=stp)
 
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_L*vflux_L_arr(3)*(1._wp/dx(j)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_L*vflux_L_arr(3)*vel_L(1)*(1._wp/dx(j)), kind=stp)
+                                rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + &
+                                                                    real((0.5_wp*dt*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j + 1)) - &
+                                                                          0.5_wp*dt*cfl*(rho_L*vel_L(2))*(1._wp/dx(j + 1))), kind=stp)
 
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) - &
-                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(3)*(1._wp/dx(j + 1)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) - &
-                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j + 1)), kind=stp)
+                                rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + &
+                                                                real((0.5_wp*dt*(vel_L(1)*(E_L + &
+                                                                                           pres_L))*(1._wp/dx(j + 1)) - &
+                                                                      0.5_wp*dt*cfl*(E_L)*(1._wp/dx(j + 1))), kind=stp)
 
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_R*vflux_R_arr(3)*(1._wp/dx(j)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_R*vflux_R_arr(3)*vel_R(1)*(1._wp/dx(j)), kind=stp)
-                            end if
-
-                            E_L = 0._wp; E_R = 0._wp
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb + 1, vidxe
-                                E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j + q, k, l)
-                            end do
-
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb, vidxe - 1
-                                E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j + q, k, l)
-                            end do
-
-                            call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
-                                                      E_R, gamma_R, pi_inf_R, rho_R, vel_R, &
-                                                      pres_L, pres_R, cfl)
+                                rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - &
+                                                            real((0.5_wp*dt*(rho_L*(vel_L(1))**2.0 + &
+                                                                             pres_L)*(1._wp/dx(j)) - &
+                                                                  0.5_wp*dt*cfl*(rho_L*vel_L(1))*(1._wp/dx(j))), kind=stp)
 
-                            do i = 1, num_fluids
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + &
-                                                            real((0.5_wp*dt*(alpha_rho_L(i)* &
-                                                                             vel_L(1))*(1._wp/dx(j + 1)) - &
-                                                                  0.5_wp*dt*cfl*(alpha_rho_L(i))*(1._wp/dx(j + 1))), kind=stp)
+                                rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - &
+                                                                real((0.5_wp*dt*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j)) - &
+                                                                      0.5_wp*dt*cfl*(rho_L*vel_L(2))*(1._wp/dx(j))), kind=stp)
 
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - &
-                                                        real((0.5_wp*dt*(alpha_rho_L(i)* &
-                                                                         vel_L(1))*(1._wp/dx(j)) - &
-                                                              0.5_wp*dt*cfl*(alpha_rho_L(i))*(1._wp/dx(j))), kind=stp)
-                            end do
+                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - &
+                                                            real((0.5_wp*dt*(vel_L(1)*(E_L + &
+                                                                                       pres_L))*(1._wp/dx(j)) - &
+                                                                  0.5_wp*dt*cfl*(E_L)*(1._wp/dx(j))), kind=stp)
 
-                            if (num_fluids > 1) then
                                 $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids - 1
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + &
-                                                                            real((0.5_wp*dt*(alpha_L(i)* &
-                                                                                             vel_L(1))*(1._wp/dx(j + 1)) - &
-                                                                                  0.5_wp*dt*cfl*(alpha_L(i))*(1._wp/dx(j + 1))), kind=stp)
-
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) &
-                                                                            - real((0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_L(1)*(1._wp/dx(j + 1))), kind=stp)
-
+                                do i = 1, num_fluids
                                     $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - &
-                                                                        real((0.5_wp*dt*(alpha_L(i)* &
-                                                                                         vel_L(1))*(1._wp/dx(j)) - &
-                                                                              0.5_wp*dt*cfl*(alpha_L(i))*(1._wp/dx(j))), kind=stp)
+                                    rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + &
+                                                                real((0.5_wp*dt*(alpha_rho_R(i)* &
+                                                                                 vel_R(1))*(1._wp/dx(j + 1)) + &
+                                                                      0.5_wp*dt*cfl*(alpha_rho_R(i))*(1._wp/dx(j + 1))), kind=stp)
 
                                     $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) &
-                                                                        + real((0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(1)*(1._wp/dx(j))), kind=stp)
+                                    rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - &
+                                                            real((0.5_wp*dt*(alpha_rho_R(i)* &
+                                                                             vel_R(1))*(1._wp/dx(j)) + &
+                                                                  0.5_wp*dt*cfl*(alpha_rho_R(i))*(1._wp/dx(j))), kind=stp)
                                 end do
-                            end if
-
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + &
-                                                            real((0.5_wp*dt*(rho_L*(vel_L(1))**2.0 + &
-                                                                             pres_L)*(1._wp/dx(j + 1)) - &
-                                                                  0.5_wp*dt*cfl*(rho_L*vel_L(1))*(1._wp/dx(j + 1))), kind=stp)
-
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + &
-                                                                real((0.5_wp*dt*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j + 1)) - &
-                                                                      0.5_wp*dt*cfl*(rho_L*vel_L(2))*(1._wp/dx(j + 1))), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + &
-                                                            real((0.5_wp*dt*(vel_L(1)*(E_L + &
-                                                                                       pres_L))*(1._wp/dx(j + 1)) - &
-                                                                  0.5_wp*dt*cfl*(E_L)*(1._wp/dx(j + 1))), kind=stp)
+                                if (num_fluids > 1) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + &
+                                                                                real((0.5_wp*dt*(alpha_R(i)* &
+                                                                                                 vel_R(1))*(1._wp/dx(j + 1)) + &
+                                                                                      0.5_wp*dt*cfl*(alpha_R(i))*(1._wp/dx(j + 1))), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - &
-                                                        real((0.5_wp*dt*(rho_L*(vel_L(1))**2.0 + &
-                                                                         pres_L)*(1._wp/dx(j)) - &
-                                                              0.5_wp*dt*cfl*(rho_L*vel_L(1))*(1._wp/dx(j))), kind=stp)
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) &
+                                                                                - real((0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_R(1)*(1._wp/dx(j + 1))), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - &
-                                                            real((0.5_wp*dt*rho_L*vel_L(1)*vel_L(2)*(1._wp/dx(j)) - &
-                                                                  0.5_wp*dt*cfl*(rho_L*vel_L(2))*(1._wp/dx(j))), kind=stp)
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - &
+                                                                            real((0.5_wp*dt*(alpha_R(i)* &
+                                                                                             vel_R(1))*(1._wp/dx(j)) + &
+                                                                                  0.5_wp*dt*cfl*(alpha_R(i))*(1._wp/dx(j))), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - &
-                                                        real((0.5_wp*dt*(vel_L(1)*(E_L + &
-                                                                                   pres_L))*(1._wp/dx(j)) - &
-                                                              0.5_wp*dt*cfl*(E_L)*(1._wp/dx(j))), kind=stp)
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) &
+                                                                            + real((0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(1)*(1._wp/dx(j))), kind=stp)
+                                    end do
+                                end if
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(i)%sf(j + 1, k, l) = rhs_vf(i)%sf(j + 1, k, l) + &
-                                                            real((0.5_wp*dt*(alpha_rho_R(i)* &
-                                                                             vel_R(1))*(1._wp/dx(j + 1)) + &
-                                                                  0.5_wp*dt*cfl*(alpha_rho_R(i))*(1._wp/dx(j + 1))), kind=stp)
+                                rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + &
+                                                                real((0.5_wp*dt*(rho_R*(vel_R(1))**2.0 + &
+                                                                                 pres_R)*(1._wp/dx(j + 1)) + &
+                                                                      0.5_wp*dt*cfl*(rho_R*vel_R(1))*(1._wp/dx(j + 1))), kind=stp)
 
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - &
-                                                        real((0.5_wp*dt*(alpha_rho_R(i)* &
-                                                                         vel_R(1))*(1._wp/dx(j)) + &
-                                                              0.5_wp*dt*cfl*(alpha_rho_R(i))*(1._wp/dx(j))), kind=stp)
-                            end do
-
-                            if (num_fluids > 1) then
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids - 1
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) + &
-                                                                            real((0.5_wp*dt*(alpha_R(i)* &
-                                                                                             vel_R(1))*(1._wp/dx(j + 1)) + &
-                                                                                  0.5_wp*dt*cfl*(alpha_R(i))*(1._wp/dx(j + 1))), kind=stp)
-
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j + 1, k, l) = rhs_vf(advxb + i - 1)%sf(j + 1, k, l) &
-                                                                            - real((0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j + 1, k, l)*vel_R(1)*(1._wp/dx(j + 1))), kind=stp)
-
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - &
-                                                                        real((0.5_wp*dt*(alpha_R(i)* &
-                                                                                         vel_R(1))*(1._wp/dx(j)) + &
-                                                                              0.5_wp*dt*cfl*(alpha_R(i))*(1._wp/dx(j))), kind=stp)
+                                rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + &
+                                                                    real((0.5_wp*dt*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j + 1)) + &
+                                                                          0.5_wp*dt*cfl*(rho_R*vel_R(2))*(1._wp/dx(j + 1))), kind=stp)
 
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) &
-                                                                        + real((0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(1)*(1._wp/dx(j))), kind=stp)
-                                end do
-                            end if
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + &
+                                                                real((0.5_wp*dt*(vel_R(1)*(E_R + &
+                                                                                           pres_R))*(1._wp/dx(j + 1)) + &
+                                                                      0.5_wp*dt*cfl*(E_R)*(1._wp/dx(j + 1))), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb)%sf(j + 1, k, l) = rhs_vf(momxb)%sf(j + 1, k, l) + &
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - &
                                                             real((0.5_wp*dt*(rho_R*(vel_R(1))**2.0 + &
-                                                                             pres_R)*(1._wp/dx(j + 1)) + &
-                                                                  0.5_wp*dt*cfl*(rho_R*vel_R(1))*(1._wp/dx(j + 1))), kind=stp)
+                                                                             pres_R)*(1._wp/dx(j)) + &
+                                                                  0.5_wp*dt*cfl*(rho_R*vel_R(1))*(1._wp/dx(j))), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb + 1)%sf(j + 1, k, l) = rhs_vf(momxb + 1)%sf(j + 1, k, l) + &
-                                                                real((0.5_wp*dt*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j + 1)) + &
-                                                                      0.5_wp*dt*cfl*(rho_R*vel_R(2))*(1._wp/dx(j + 1))), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - &
+                                                                real((0.5_wp*dt*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j)) + &
+                                                                      0.5_wp*dt*cfl*(rho_R*vel_R(2))*(1._wp/dx(j))), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(E_idx)%sf(j + 1, k, l) = rhs_vf(E_idx)%sf(j + 1, k, l) + &
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - &
                                                             real((0.5_wp*dt*(vel_R(1)*(E_R + &
-                                                                                       pres_R))*(1._wp/dx(j + 1)) + &
-                                                                  0.5_wp*dt*cfl*(E_R)*(1._wp/dx(j + 1))), kind=stp)
-
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - &
-                                                        real((0.5_wp*dt*(rho_R*(vel_R(1))**2.0 + &
-                                                                         pres_R)*(1._wp/dx(j)) + &
-                                                              0.5_wp*dt*cfl*(rho_R*vel_R(1))*(1._wp/dx(j))), kind=stp)
-
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - &
-                                                            real((0.5_wp*dt*rho_R*vel_R(1)*vel_R(2)*(1._wp/dx(j)) + &
-                                                                  0.5_wp*dt*cfl*(rho_R*vel_R(2))*(1._wp/dx(j))), kind=stp)
-
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - &
-                                                        real((0.5_wp*dt*(vel_R(1)*(E_R + &
-                                                                                   pres_R))*(1._wp/dx(j)) + &
-                                                              0.5_wp*dt*cfl*(E_R)*(1._wp/dx(j))), kind=stp)
+                                                                                       pres_R))*(1._wp/dx(j)) + &
+                                                                  0.5_wp*dt*cfl*(E_R)*(1._wp/dx(j))), kind=stp)
 
+                            end do
                         end do
                     end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
+                    $:END_GPU_PARALLEL_LOOP()
                 #:endif
             else
                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
@@ -1415,400 +1414,400 @@ contains
         else if (idir == 2) then
             if (p == 0) then
                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]')
-                do l = 0, p
-                    do k = -1, n
-                        do j = 0, m
+                    $:GPU_PARALLEL_LOOP(collapse=3, private='[j,k,l,rho_L, rho_R, gamma_L, gamma_R, pi_inf_L, pi_inf_R, mu_L, mu_R, vel_L, vel_R, pres_L, pres_R, alpha_L, alpha_R, alpha_rho_L, alpha_rho_R, F_L, F_R, E_L, E_R, cfl, dvel_small, rho_sf_small, vflux_L_arr, vflux_R_arr]')
+                    do l = 0, p
+                        do k = -1, n
+                            do j = 0, m
 
-                            if (viscous) then
-                                vflux_L_arr = 0._wp
-                                vflux_R_arr = 0._wp
+                                if (viscous) then
+                                    vflux_L_arr = 0._wp
+                                    vflux_R_arr = 0._wp
 
-                                #:if MFC_CASE_OPTIMIZATION
-                                    #:if igr_order == 5
-                                        !DIR$ unroll 6
-                                    #:elif igr_order == 3
-                                        !DIR$ unroll 4
-                                    #:endif
-                                #:endif
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do q = vidxb, vidxe
-
-                                    !x-direction contributions
+                                    #:if MFC_CASE_OPTIMIZATION
+                                        #:if igr_order == 5
+                                            !DIR$ unroll 6
+                                        #:elif igr_order == 3
+                                            !DIR$ unroll 4
+                                        #:endif
+                                    #:endif
                                     $:GPU_LOOP(parallelism='[seq]')
-                                    do i = -1, 1
-                                        rho_L = 0._wp
+                                    do q = vidxb, vidxe
+
+                                        !x-direction contributions
                                         $:GPU_LOOP(parallelism='[seq]')
-                                        do r = 1, num_fluids
-                                            rho_L = rho_L + q_cons_vf(r)%sf(j + i, k + q, l)
+                                        do i = -1, 1
+                                            rho_L = 0._wp
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do r = 1, num_fluids
+                                                rho_L = rho_L + q_cons_vf(r)%sf(j + i, k + q, l)
+                                            end do
+                                            rho_sf_small(i) = rho_L
                                         end do
-                                        rho_sf_small(i) = rho_L
-                                    end do
 
-                                    dvel_small(1) = (1/(2._wp*dx(j)))*( &
-                                                    q_cons_vf(momxb)%sf(j + 1, k + q, l)/rho_sf_small(1) - &
-                                                    q_cons_vf(momxb)%sf(j - 1, k + q, l)/rho_sf_small(-1))
-                                    dvel_small(2) = (1/(2._wp*dx(j)))*( &
-                                                    q_cons_vf(momxb + 1)%sf(j + 1, k + q, l)/rho_sf_small(1) - &
-                                                    q_cons_vf(momxb + 1)%sf(j - 1, k + q, l)/rho_sf_small(-1))
+                                        dvel_small(1) = (1/(2._wp*dx(j)))*( &
+                                                        q_cons_vf(momxb)%sf(j + 1, k + q, l)/rho_sf_small(1) - &
+                                                        q_cons_vf(momxb)%sf(j - 1, k + q, l)/rho_sf_small(-1))
+                                        dvel_small(2) = (1/(2._wp*dx(j)))*( &
+                                                        q_cons_vf(momxb + 1)%sf(j + 1, k + q, l)/rho_sf_small(1) - &
+                                                        q_cons_vf(momxb + 1)%sf(j - 1, k + q, l)/rho_sf_small(-1))
 
-                                    if (q > vidxb) then
-                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp
-                                    end if
-                                    if (q < vidxe) then
-                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp
-                                    end if
+                                        if (q > vidxb) then
+                                            vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(2))
+                                            vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(-2._wp*dvel_small(1))/3._wp
+                                        end if
+                                        if (q < vidxe) then
+                                            vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(2))
+                                            vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(-2._wp*dvel_small(1))/3._wp
+                                        end if
 
-                                    !y-direction contributions
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = -1, 1
-                                        rho_L = 0._wp
+                                        !y-direction contributions
                                         $:GPU_LOOP(parallelism='[seq]')
-                                        do r = 1, num_fluids
-                                            rho_L = rho_L + q_cons_vf(r)%sf(j, k + i + q, l)
+                                        do i = -1, 1
+                                            rho_L = 0._wp
+                                            $:GPU_LOOP(parallelism='[seq]')
+                                            do r = 1, num_fluids
+                                                rho_L = rho_L + q_cons_vf(r)%sf(j, k + i + q, l)
+                                            end do
+                                            rho_sf_small(i) = rho_L
                                         end do
-                                        rho_sf_small(i) = rho_L
-                                    end do
 
-                                    dvel_small(1) = (1/(2._wp*dy(k)))*( &
-                                                    q_cons_vf(momxb)%sf(j, k + 1 + q, l)/rho_sf_small(1) - &
-                                                    q_cons_vf(momxb)%sf(j, k - 1 + q, l)/rho_sf_small(-1))
-                                    dvel_small(2) = (1/(2._wp*dy(k)))*( &
-                                                    q_cons_vf(momxb + 1)%sf(j, k + 1 + q, l)/rho_sf_small(1) - &
-                                                    q_cons_vf(momxb + 1)%sf(j, k - 1 + q, l)/rho_sf_small(-1))
-
-                                    if (q > vidxb) then
-                                        vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
-                                        vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(2))/3._wp
-                                    end if
-                                    if (q < vidxe) then
-                                        vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
-                                        vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(2))/3._wp
-                                    end if
-                                end do
-                            end if
+                                        dvel_small(1) = (1/(2._wp*dy(k)))*( &
+                                                        q_cons_vf(momxb)%sf(j, k + 1 + q, l)/rho_sf_small(1) - &
+                                                        q_cons_vf(momxb)%sf(j, k - 1 + q, l)/rho_sf_small(-1))
+                                        dvel_small(2) = (1/(2._wp*dy(k)))*( &
+                                                        q_cons_vf(momxb + 1)%sf(j, k + 1 + q, l)/rho_sf_small(1) - &
+                                                        q_cons_vf(momxb + 1)%sf(j, k - 1 + q, l)/rho_sf_small(-1))
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                alpha_rho_L(i) = 0._wp
-                                alpha_rho_R(i) = 0._wp
-                                alpha_L(i) = 0._wp
-                                alpha_R(i) = 0._wp
-                            end do
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_L(i) = 0._wp
-                                vel_R(i) = 0._wp
-                            end do
+                                        if (q > vidxb) then
+                                            vflux_L_arr(1) = vflux_L_arr(1) + coeff_L(q)*(dvel_small(1))
+                                            vflux_L_arr(3) = vflux_L_arr(3) + coeff_L(q)*(4._wp*dvel_small(2))/3._wp
+                                        end if
+                                        if (q < vidxe) then
+                                            vflux_R_arr(1) = vflux_R_arr(1) + coeff_R(q)*(dvel_small(1))
+                                            vflux_R_arr(3) = vflux_R_arr(3) + coeff_R(q)*(4._wp*dvel_small(2))/3._wp
+                                        end if
+                                    end do
+                                end if
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb + 1, vidxe
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k + q, l)
+                                    alpha_rho_L(i) = 0._wp
+                                    alpha_rho_R(i) = 0._wp
+                                    alpha_L(i) = 0._wp
+                                    alpha_R(i) = 0._wp
                                 end do
-
-                                if (num_fluids > 1) then
-                                    $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids - 1
-                                        alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
-                                    end do
-                                else
-                                    alpha_L(1) = 1._wp
-                                end if
-
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_dims
-                                    vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
+                                    vel_L(i) = 0._wp
+                                    vel_R(i) = 0._wp
                                 end do
-                            end do
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb, vidxe - 1
                                 $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids
-                                    alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k + q, l)
-                                end do
-
-                                if (num_fluids > 1) then
+                                do q = vidxb + 1, vidxe
                                     $:GPU_LOOP(parallelism='[seq]')
-                                    do i = 1, num_fluids - 1
-                                        alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
+                                    do i = 1, num_fluids
+                                        alpha_rho_L(i) = alpha_rho_L(i) + coeff_L(q)*q_cons_vf(i)%sf(j, k + q, l)
                                     end do
-                                else
-                                    alpha_R(1) = 1._wp
-                                end if
 
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_dims
-                                    vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
+                                    if (num_fluids > 1) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, num_fluids - 1
+                                            alpha_L(i) = alpha_L(i) + coeff_L(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
+                                        end do
+                                    else
+                                        alpha_L(1) = 1._wp
+                                    end if
+
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        vel_L(i) = vel_L(i) + coeff_L(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
+                                    end do
                                 end do
-                            end do
 
-                            if (num_fluids > 1) then
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do q = vidxb, vidxe - 1
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        alpha_rho_R(i) = alpha_rho_R(i) + coeff_R(q)*q_cons_vf(i)%sf(j, k + q, l)
+                                    end do
 
-                                alpha_L(num_fluids) = 1._wp
-                                alpha_R(num_fluids) = 1._wp
+                                    if (num_fluids > 1) then
+                                        $:GPU_LOOP(parallelism='[seq]')
+                                        do i = 1, num_fluids - 1
+                                            alpha_R(i) = alpha_R(i) + coeff_R(q)*q_cons_vf(E_idx + i)%sf(j, k + q, l)
+                                        end do
+                                    else
+                                        alpha_R(1) = 1._wp
+                                    end if
 
-                                $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids - 1
-                                    alpha_L(num_fluids) = alpha_L(num_fluids) - alpha_L(i)
-                                    alpha_R(num_fluids) = alpha_R(num_fluids) - alpha_R(i)
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_dims
+                                        vel_R(i) = vel_R(i) + coeff_R(q)*q_cons_vf(momxb + i - 1)%sf(j, k + q, l)
+                                    end do
                                 end do
-                            end if
 
-                            rho_L = 0._wp; rho_R = 0._wp
-                            gamma_L = 0._wp; gamma_R = 0._wp
-                            pi_inf_L = 0._wp; pi_inf_R = 0._wp
+                                if (num_fluids > 1) then
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                rho_L = rho_L + alpha_rho_L(i)
-                                gamma_L = gamma_L + alpha_L(i)*gammas(i)
-                                pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
+                                    alpha_L(num_fluids) = 1._wp
+                                    alpha_R(num_fluids) = 1._wp
 
-                                rho_R = rho_R + alpha_rho_R(i)
-                                gamma_R = gamma_R + alpha_R(i)*gammas(i)
-                                pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
-                            end do
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        alpha_L(num_fluids) = alpha_L(num_fluids) - alpha_L(i)
+                                        alpha_R(num_fluids) = alpha_R(num_fluids) - alpha_R(i)
+                                    end do
+                                end if
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_dims
-                                vel_L(i) = vel_L(i)/rho_L
-                                vel_R(i) = vel_R(i)/rho_R
-                            end do
+                                rho_L = 0._wp; rho_R = 0._wp
+                                gamma_L = 0._wp; gamma_R = 0._wp
+                                pi_inf_L = 0._wp; pi_inf_R = 0._wp
 
-                            if (viscous) then
-                                mu_L = 0._wp
-                                mu_R = 0._wp
                                 $:GPU_LOOP(parallelism='[seq]')
                                 do i = 1, num_fluids
-                                    mu_L = alpha_L(i)/Res_igr(1, i) + mu_L
-                                    mu_R = alpha_R(i)/Res_igr(1, i) + mu_R
+                                    rho_L = rho_L + alpha_rho_L(i)
+                                    gamma_L = gamma_L + alpha_L(i)*gammas(i)
+                                    pi_inf_L = pi_inf_L + alpha_L(i)*pi_infs(i)
+
+                                    rho_R = rho_R + alpha_rho_R(i)
+                                    gamma_R = gamma_R + alpha_R(i)*gammas(i)
+                                    pi_inf_R = pi_inf_R + alpha_R(i)*pi_infs(i)
                                 end do
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - &
-                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(1)*(1._wp/dy(k + 1)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - &
-                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k + 1)), kind=stp)
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do i = 1, num_dims
+                                    vel_L(i) = vel_L(i)/rho_L
+                                    vel_R(i) = vel_R(i)/rho_R
+                                end do
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_L*vflux_L_arr(1)*(1._wp/dy(k)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k)), kind=stp)
+                                if (viscous) then
+                                    mu_L = 0._wp
+                                    mu_R = 0._wp
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids
+                                        mu_L = alpha_L(i)/Res_igr(1, i) + mu_L
+                                        mu_R = alpha_R(i)/Res_igr(1, i) + mu_R
+                                    end do
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - &
-                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(1)*(1._wp/dy(k + 1)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - &
-                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - &
+                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(1)*(1._wp/dy(k + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - &
+                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k + 1)), kind=stp)
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_R*vflux_R_arr(1)*(1._wp/dy(k)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(1)*(1._wp/dy(k)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(1)*vel_L(1)*(1._wp/dy(k)), kind=stp)
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - &
-                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(3)*(1._wp/dy(k + 1)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - &
-                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) - &
+                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(1)*(1._wp/dy(k + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - &
+                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k + 1)), kind=stp)
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + &
-                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(3)*(1._wp/dy(k)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(1)*(1._wp/dy(k)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(1)*vel_R(1)*(1._wp/dy(k)), kind=stp)
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - &
-                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(3)*(1._wp/dy(k + 1)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - &
-                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - &
+                                                                        real(0.5_wp*dt*mu_L*vflux_L_arr(3)*(1._wp/dy(k + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - &
+                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k + 1)), kind=stp)
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + &
-                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(3)*(1._wp/dy(k)), kind=stp)
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
-                                                            real(0.5_wp*dt*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k)), kind=stp)
-                            end if
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + &
+                                                                    real(0.5_wp*dt*mu_L*vflux_L_arr(3)*(1._wp/dy(k)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_L*vflux_L_arr(3)*vel_L(2)*(1._wp/dy(k)), kind=stp)
 
-                            E_L = 0._wp; E_R = 0._wp
-                            F_L = 0._wp; F_R = 0._wp
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) - &
+                                                                        real(0.5_wp*dt*mu_R*vflux_R_arr(3)*(1._wp/dy(k + 1)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) - &
+                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k + 1)), kind=stp)
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb + 1, vidxe
-                                E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k + q, l)
-                                F_L = F_L + coeff_L(q)*jac(j, k + q, l)
-                            end do
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) + &
+                                                                    real(0.5_wp*dt*mu_R*vflux_R_arr(3)*(1._wp/dy(k)), kind=stp)
+                                    $:GPU_ATOMIC(atomic='update')
+                                    rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) + &
+                                                                real(0.5_wp*dt*mu_R*vflux_R_arr(3)*vel_R(2)*(1._wp/dy(k)), kind=stp)
+                                end if
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do q = vidxb, vidxe - 1
-                                E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k + q, l)
-                                F_R = F_R + coeff_R(q)*jac(j, k + q, l)
-                            end do
+                                E_L = 0._wp; E_R = 0._wp
+                                F_L = 0._wp; F_R = 0._wp
 
-                            call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
-                                                      E_R, gamma_R, pi_inf_R, rho_R, vel_R, &
-                                                      pres_L, pres_R, cfl)
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do q = vidxb + 1, vidxe
+                                    E_L = E_L + coeff_L(q)*q_cons_vf(E_idx)%sf(j, k + q, l)
+                                    F_L = F_L + coeff_L(q)*jac(j, k + q, l)
+                                end do
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + &
-                                                            real(0.5_wp*dt*(alpha_rho_L(i)* &
-                                                                            vel_L(2))*(1._wp/dy(k + 1)) - &
-                                                                 0.5_wp*dt*cfl*(alpha_rho_L(i))*(1._wp/dy(k + 1)), kind=stp)
+                                $:GPU_LOOP(parallelism='[seq]')
+                                do q = vidxb, vidxe - 1
+                                    E_R = E_R + coeff_R(q)*q_cons_vf(E_idx)%sf(j, k + q, l)
+                                    F_R = F_R + coeff_R(q)*jac(j, k + q, l)
+                                end do
 
-                                $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - &
-                                                        real(0.5_wp*dt*(alpha_rho_L(i)* &
-                                                                        vel_L(2))*(1._wp/dy(k)) - &
-                                                             0.5_wp*dt*cfl*(alpha_rho_L(i))*(1._wp/dy(k)), kind=stp)
-                            end do
+                                call s_get_derived_states(E_L, gamma_L, pi_inf_L, rho_L, vel_L, &
+                                                          E_R, gamma_R, pi_inf_R, rho_R, vel_R, &
+                                                          pres_L, pres_R, cfl)
 
-                            if (num_fluids > 1) then
                                 $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids - 1
+                                do i = 1, num_fluids
                                     $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + &
-                                                                            real(0.5_wp*dt*(alpha_L(i)* &
-                                                                                            vel_L(2))*(1._wp/dy(k + 1)) - &
-                                                                                 0.5_wp*dt*cfl*(alpha_L(i))*(1._wp/dy(k + 1)), kind=stp)
+                                    rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + &
+                                                                real(0.5_wp*dt*(alpha_rho_L(i)* &
+                                                                                vel_L(2))*(1._wp/dy(k + 1)) - &
+                                                                     0.5_wp*dt*cfl*(alpha_rho_L(i))*(1._wp/dy(k + 1)), kind=stp)
 
                                     $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) &
-                                                                            - real(0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_L(2)*(1._wp/dy(k + 1)), kind=stp)
+                                    rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - &
+                                                            real(0.5_wp*dt*(alpha_rho_L(i)* &
+                                                                            vel_L(2))*(1._wp/dy(k)) - &
+                                                                 0.5_wp*dt*cfl*(alpha_rho_L(i))*(1._wp/dy(k)), kind=stp)
+                                end do
 
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - &
-                                                                        real(0.5_wp*dt*(alpha_L(i)* &
-                                                                                        vel_L(2))*(1._wp/dy(k)) - &
-                                                                             0.5_wp*dt*cfl*(alpha_L(i))*(1._wp/dy(k)), kind=stp)
+                                if (num_fluids > 1) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + &
+                                                                                real(0.5_wp*dt*(alpha_L(i)* &
+                                                                                                vel_L(2))*(1._wp/dy(k + 1)) - &
+                                                                                     0.5_wp*dt*cfl*(alpha_L(i))*(1._wp/dy(k + 1)), kind=stp)
 
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) &
-                                                                        + real(0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(2)*(1._wp/dy(k)), kind=stp)
-                                end do
-                            end if
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) &
+                                                                                - real(0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_L(2)*(1._wp/dy(k + 1)), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + &
-                                                                real(0.5_wp*dt*(rho_L*(vel_L(2))**2.0 + &
-                                                                                pres_L + F_L)*(1._wp/dy(k + 1)) - &
-                                                                     0.5_wp*dt*cfl*(rho_L*vel_L(2))*(1._wp/dy(k + 1)), kind=stp)
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - &
+                                                                            real(0.5_wp*dt*(alpha_L(i)* &
+                                                                                            vel_L(2))*(1._wp/dy(k)) - &
+                                                                                 0.5_wp*dt*cfl*(alpha_L(i))*(1._wp/dy(k)), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + &
-                                                            real(0.5_wp*dt*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k + 1)) - &
-                                                                 0.5_wp*dt*cfl*(rho_L*vel_L(1))*(1._wp/dy(k + 1)), kind=stp)
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) &
+                                                                            + real(0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_L(2)*(1._wp/dy(k)), kind=stp)
+                                    end do
+                                end if
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + &
-                                                            real(0.5_wp*dt*(vel_L(2)*(E_L + &
-                                                                                      pres_L + F_L))*(1._wp/dy(k + 1)) - &
-                                                                 0.5_wp*dt*cfl*(E_L)*(1._wp/dy(k + 1)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + &
+                                                                    real(0.5_wp*dt*(rho_L*(vel_L(2))**2.0 + &
+                                                                                    pres_L + F_L)*(1._wp/dy(k + 1)) - &
+                                                                         0.5_wp*dt*cfl*(rho_L*vel_L(2))*(1._wp/dy(k + 1)), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - &
-                                                            real(0.5_wp*dt*(rho_L*(vel_L(2))**2.0 + &
-                                                                            pres_L + F_L)*(1._wp/dy(k)) - &
-                                                                 0.5_wp*dt*cfl*(rho_L*vel_L(2))*(1._wp/dy(k)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + &
+                                                                real(0.5_wp*dt*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k + 1)) - &
+                                                                     0.5_wp*dt*cfl*(rho_L*vel_L(1))*(1._wp/dy(k + 1)), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - &
-                                                        real(0.5_wp*dt*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k)) - &
-                                                             0.5_wp*dt*cfl*(rho_L*vel_L(1))*(1._wp/dy(k)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + &
+                                                                real(0.5_wp*dt*(vel_L(2)*(E_L + &
+                                                                                          pres_L + F_L))*(1._wp/dy(k + 1)) - &
+                                                                     0.5_wp*dt*cfl*(E_L)*(1._wp/dy(k + 1)), kind=stp)
 
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - &
-                                                        real(0.5_wp*dt*(vel_L(2)*(E_L + &
-                                                                                  pres_L + F_L))*(1._wp/dy(k)) - &
-                                                             0.5_wp*dt*cfl*(E_L)*(1._wp/dy(k)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - &
+                                                                real(0.5_wp*dt*(rho_L*(vel_L(2))**2.0 + &
+                                                                                pres_L + F_L)*(1._wp/dy(k)) - &
+                                                                     0.5_wp*dt*cfl*(rho_L*vel_L(2))*(1._wp/dy(k)), kind=stp)
 
-                            $:GPU_LOOP(parallelism='[seq]')
-                            do i = 1, num_fluids
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + &
-                                                            real(0.5_wp*dt*(alpha_rho_R(i)* &
-                                                                            vel_R(2))*(1._wp/dy(k + 1)) + &
-                                                                 0.5_wp*dt*cfl*(alpha_rho_R(i))*(1._wp/dy(k + 1)), kind=stp)
+                                rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - &
+                                                            real(0.5_wp*dt*rho_L*vel_L(1)*vel_L(2)*(1._wp/dy(k)) - &
+                                                                 0.5_wp*dt*cfl*(rho_L*vel_L(1))*(1._wp/dy(k)), kind=stp)
+
                                 $:GPU_ATOMIC(atomic='update')
-                                rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - &
-                                                        real(0.5_wp*dt*(alpha_rho_R(i)* &
-                                                                        vel_R(2))*(1._wp/dy(k)) + &
-                                                             0.5_wp*dt*cfl*(alpha_rho_R(i))*(1._wp/dy(k)), kind=stp)
-                            end do
+                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - &
+                                                            real(0.5_wp*dt*(vel_L(2)*(E_L + &
+                                                                                      pres_L + F_L))*(1._wp/dy(k)) - &
+                                                                 0.5_wp*dt*cfl*(E_L)*(1._wp/dy(k)), kind=stp)
 
-                            if (num_fluids > 1) then
                                 $:GPU_LOOP(parallelism='[seq]')
-                                do i = 1, num_fluids - 1
+                                do i = 1, num_fluids
                                     $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + &
-                                                                            real(0.5_wp*dt*(alpha_R(i)* &
-                                                                                            vel_R(2))*(1._wp/dy(k + 1)) + &
-                                                                                 0.5_wp*dt*cfl*(alpha_R(i))*(1._wp/dy(k + 1)), kind=stp)
-
+                                    rhs_vf(i)%sf(j, k + 1, l) = rhs_vf(i)%sf(j, k + 1, l) + &
+                                                                real(0.5_wp*dt*(alpha_rho_R(i)* &
+                                                                                vel_R(2))*(1._wp/dy(k + 1)) + &
+                                                                     0.5_wp*dt*cfl*(alpha_rho_R(i))*(1._wp/dy(k + 1)), kind=stp)
                                     $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) &
-                                                                            - real(0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_R(2)*(1._wp/dy(k + 1)), kind=stp)
+                                    rhs_vf(i)%sf(j, k, l) = rhs_vf(i)%sf(j, k, l) - &
+                                                            real(0.5_wp*dt*(alpha_rho_R(i)* &
+                                                                            vel_R(2))*(1._wp/dy(k)) + &
+                                                                 0.5_wp*dt*cfl*(alpha_rho_R(i))*(1._wp/dy(k)), kind=stp)
+                                end do
 
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - &
-                                                                        real(0.5_wp*dt*(alpha_R(i)* &
-                                                                                        vel_R(2))*(1._wp/dy(k)) + &
-                                                                             0.5_wp*dt*cfl*(alpha_R(i))*(1._wp/dy(k)), kind=stp)
+                                if (num_fluids > 1) then
+                                    $:GPU_LOOP(parallelism='[seq]')
+                                    do i = 1, num_fluids - 1
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) + &
+                                                                                real(0.5_wp*dt*(alpha_R(i)* &
+                                                                                                vel_R(2))*(1._wp/dy(k + 1)) + &
+                                                                                     0.5_wp*dt*cfl*(alpha_R(i))*(1._wp/dy(k + 1)), kind=stp)
 
-                                    $:GPU_ATOMIC(atomic='update')
-                                    rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) &
-                                                                        + real(0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(2)*(1._wp/dy(k)), kind=stp)
-                                end do
-                            end if
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + &
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k + 1, l) = rhs_vf(advxb + i - 1)%sf(j, k + 1, l) &
+                                                                                - real(0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k + 1, l)*vel_R(2)*(1._wp/dy(k + 1)), kind=stp)
+
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) - &
+                                                                            real(0.5_wp*dt*(alpha_R(i)* &
+                                                                                            vel_R(2))*(1._wp/dy(k)) + &
+                                                                                 0.5_wp*dt*cfl*(alpha_R(i))*(1._wp/dy(k)), kind=stp)
+
+                                        $:GPU_ATOMIC(atomic='update')
+                                        rhs_vf(advxb + i - 1)%sf(j, k, l) = rhs_vf(advxb + i - 1)%sf(j, k, l) &
+                                                                            + real(0.5_wp*dt*q_cons_vf(advxb + i - 1)%sf(j, k, l)*vel_R(2)*(1._wp/dy(k)), kind=stp)
+                                    end do
+                                end if
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb + 1)%sf(j, k + 1, l) = rhs_vf(momxb + 1)%sf(j, k + 1, l) + &
+                                                                    real(0.5_wp*dt*(rho_R*(vel_R(2))**2.0 + &
+                                                                                    pres_R + F_R)*(1._wp/dy(k + 1)) + &
+                                                                         0.5_wp*dt*cfl*(rho_R*vel_R(2))*(1._wp/dy(k + 1)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + &
+                                                                real(0.5_wp*dt*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k + 1)) + &
+                                                                     0.5_wp*dt*cfl*(rho_R*vel_R(1))*(1._wp/dy(k + 1)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + &
+                                                                real(0.5_wp*dt*(vel_R(2)*(E_R + &
+                                                                                          pres_R + F_R))*(1._wp/dy(k + 1)) + &
+                                                                     0.5_wp*dt*cfl*(E_R)*(1._wp/dy(k + 1)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - &
                                                                 real(0.5_wp*dt*(rho_R*(vel_R(2))**2.0 + &
-                                                                                pres_R + F_R)*(1._wp/dy(k + 1)) + &
-                                                                     0.5_wp*dt*cfl*(rho_R*vel_R(2))*(1._wp/dy(k + 1)), kind=stp)
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb)%sf(j, k + 1, l) = rhs_vf(momxb)%sf(j, k + 1, l) + &
-                                                            real(0.5_wp*dt*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k + 1)) + &
-                                                                 0.5_wp*dt*cfl*(rho_R*vel_R(1))*(1._wp/dy(k + 1)), kind=stp)
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(E_idx)%sf(j, k + 1, l) = rhs_vf(E_idx)%sf(j, k + 1, l) + &
+                                                                                pres_R + F_R)*(1._wp/dy(k)) + &
+                                                                     0.5_wp*dt*cfl*(rho_R*vel_R(2))*(1._wp/dy(k)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - &
+                                                            real(0.5_wp*dt*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k)) + &
+                                                                 0.5_wp*dt*cfl*(rho_R*vel_R(1))*(1._wp/dy(k)), kind=stp)
+                                $:GPU_ATOMIC(atomic='update')
+                                rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - &
                                                             real(0.5_wp*dt*(vel_R(2)*(E_R + &
-                                                                                      pres_R + F_R))*(1._wp/dy(k + 1)) + &
-                                                                 0.5_wp*dt*cfl*(E_R)*(1._wp/dy(k + 1)), kind=stp)
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb + 1)%sf(j, k, l) = rhs_vf(momxb + 1)%sf(j, k, l) - &
-                                                            real(0.5_wp*dt*(rho_R*(vel_R(2))**2.0 + &
-                                                                            pres_R + F_R)*(1._wp/dy(k)) + &
-                                                                 0.5_wp*dt*cfl*(rho_R*vel_R(2))*(1._wp/dy(k)), kind=stp)
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(momxb)%sf(j, k, l) = rhs_vf(momxb)%sf(j, k, l) - &
-                                                        real(0.5_wp*dt*rho_R*vel_R(2)*vel_R(1)*(1._wp/dy(k)) + &
-                                                             0.5_wp*dt*cfl*(rho_R*vel_R(1))*(1._wp/dy(k)), kind=stp)
-                            $:GPU_ATOMIC(atomic='update')
-                            rhs_vf(E_idx)%sf(j, k, l) = rhs_vf(E_idx)%sf(j, k, l) - &
-                                                        real(0.5_wp*dt*(vel_R(2)*(E_R + &
-                                                                                  pres_R + F_R))*(1._wp/dy(k)) + &
-                                                             0.5_wp*dt*cfl*(E_R)*(1._wp/dy(k)), kind=stp)
+                                                                                      pres_R + F_R))*(1._wp/dy(k)) + &
+                                                                 0.5_wp*dt*cfl*(E_R)*(1._wp/dy(k)), kind=stp)
+                            end do
                         end do
                     end do
-                end do
-                $:END_GPU_PARALLEL_LOOP()
+                    $:END_GPU_PARALLEL_LOOP()
                 #:endif
             else
                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
diff --git a/src/simulation/m_pressure_relaxation.fpp b/src/simulation/m_pressure_relaxation.fpp
index d87eab8898..b94871044c 100644
--- a/src/simulation/m_pressure_relaxation.fpp
+++ b/src/simulation/m_pressure_relaxation.fpp
@@ -146,11 +146,11 @@ contains
         integer, intent(in) :: j, k, l
 
         real(wp) :: pres_relax, f_pres, df_pres
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(3) :: pres_K_init, rho_K_s
-#:else 
-        real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: pres_K_init, rho_K_s
+        #:else
+            real(wp), dimension(num_fluids) :: pres_K_init, rho_K_s
+        #:endif
         integer, parameter :: MAX_ITER = 50
         real(wp), parameter :: TOLERANCE = 1.e-10_wp
         integer :: iter, i
@@ -217,11 +217,11 @@ contains
 
         type(scalar_field), dimension(sys_size), intent(inout) :: q_cons_vf
         integer, intent(in) :: j, k, l
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(2) :: alpha_rho, alpha
-#:else 
-        real(wp), dimension(num_fluids) :: alpha_rho, alpha
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(2) :: alpha_rho, alpha
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho, alpha
+        #:endif
         real(wp) :: rho, dyn_pres, gamma, pi_inf, pres_relax, sum_alpha
         real(wp), dimension(2) :: Re
         integer :: i, q
diff --git a/src/simulation/m_qbmm.fpp b/src/simulation/m_qbmm.fpp
index 4eae61eec9..84f95af557 100644
--- a/src/simulation/m_qbmm.fpp
+++ b/src/simulation/m_qbmm.fpp
@@ -571,11 +571,11 @@ contains
             & cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
-#:if USING_AMD
-        real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeffs
-#:else
-        real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeffs
+        #:else
+            real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
+        #:endif
 
         integer :: i1, i2
 
@@ -650,11 +650,11 @@ contains
             & cray_inline=True)
 
         real(wp), intent(in) :: pres, rho, c
-#:if USING_AMD
-        real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeffs
-#:else
-        real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
-#:endif
+        #:if USING_AMD
+            real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeffs
+        #:else
+            real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeffs
+        #:endif
 
         integer :: i1, i2
 
@@ -723,18 +723,18 @@ contains
         real(stp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: mv
         real(wp), dimension(idwbuff(1)%beg:, idwbuff(2)%beg:, idwbuff(3)%beg:, 1:, 1:), intent(inout) :: rhs_mv
         type(int_bounds_info), intent(in) :: ix, iy, iz
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(6) :: moms, msum
-        real(wp), dimension(4, 3) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
-#:else 
-        real(wp), dimension(nmom) :: moms, msum
-        real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
-#:endif
-#:if USING_AMD
-        real(wp), dimension(32, 0:2, 0:2) :: coeff
-#:else
-        real(wp), dimension(nterms, 0:2, 0:2) :: coeff
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(6) :: moms, msum
+            real(wp), dimension(4, 3) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
+        #:else
+            real(wp), dimension(nmom) :: moms, msum
+            real(wp), dimension(nnode, nb) :: wght, abscX, abscY, wght_pb, wght_mv, wght_ht, ht
+        #:endif
+        #:if USING_AMD
+            real(wp), dimension(32, 0:2, 0:2) :: coeff
+        #:else
+            real(wp), dimension(nterms, 0:2, 0:2) :: coeff
+        #:endif
         real(wp) :: pres, rho, nbub, c, alf, momsum, drdt, drdt2, chi_vw, x_vw, rho_mw, k_mw, grad_T
         real(wp) :: n_tait, B_tait
         integer :: id1, id2, id3, i1, i2, j, q, r
@@ -887,11 +887,11 @@ contains
             $:GPU_ROUTINE(function_name='s_coeff_selector',parallelism='[seq]', &
                 & cray_inline=True)
             real(wp), intent(in) :: pres, rho, c
-#:if USING_AMD
-            real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeff
-#:else
-            real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
-#:endif
+            #:if USING_AMD
+                real(wp), dimension(32, 0:2, 0:2), intent(out) :: coeff
+            #:else
+                real(wp), dimension(nterms, 0:2, 0:2), intent(out) :: coeff
+            #:endif
             logical, intent(in) :: polytropic
             if (polytropic) then
                 call s_coeff(pres, rho, c, coeff)
@@ -980,11 +980,11 @@ contains
 
         function f_quad(abscX, abscY, wght_in, q, r, s)
             $:GPU_ROUTINE(parallelism='[seq]')
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(4, 3), intent(in) :: abscX, abscY, wght_in
-#:else
-            real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in 
-#:endif
+            #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+                real(wp), dimension(4, 3), intent(in) :: abscX, abscY, wght_in
+            #:else
+                real(wp), dimension(nnode, nb), intent(in) :: abscX, abscY, wght_in
+            #:endif
             real(wp), intent(in) :: q, r, s
 
             real(wp) :: f_quad_RV, f_quad
@@ -1005,11 +1005,11 @@ contains
 
         function f_quad2D(abscX, abscY, wght_in, pow)
             $:GPU_ROUTINE(parallelism='[seq]')
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-            real(wp), dimension(4), intent(in) :: abscX, abscY, wght_in
-#:else 
-            real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
-#:endif
+            #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+                real(wp), dimension(4), intent(in) :: abscX, abscY, wght_in
+            #:else
+                real(wp), dimension(nnode), intent(in) :: abscX, abscY, wght_in
+            #:endif
             real(wp), dimension(3), intent(in) :: pow
 
             real(wp) :: f_quad2D
diff --git a/src/simulation/m_rhs.fpp b/src/simulation/m_rhs.fpp
index 9689be4db0..e5788b4215 100644
--- a/src/simulation/m_rhs.fpp
+++ b/src/simulation/m_rhs.fpp
@@ -702,7 +702,7 @@ contains
             call s_populate_variables_buffers(bc_type, q_cons_vf, pb_in, mv_in)
             call nvtxEndRange
         end if
-        if(.not. igr .or. dummy) then
+        if (.not. igr .or. dummy) then
             call nvtxStartRange("RHS-CONVERT")
             call s_convert_conservative_to_primitive_variables( &
                 q_cons_qp%vf, &
@@ -780,8 +780,8 @@ contains
                     call s_igr_sigma_x(q_cons_vf, rhs_vf)
                     call nvtxEndRange
                 end if
-            end if 
-            if((.not. igr) .or. dummy) then! Finite volume solve
+            end if
+            if ((.not. igr) .or. dummy) then! Finite volume solve
 
                 ! Reconstructing Primitive/Conservative Variables
                 call nvtxStartRange("RHS-WENO")
@@ -1953,7 +1953,7 @@ contains
             end if
         #:endfor
 
-         if (recon_dir == 1) then
+        if (recon_dir == 1) then
             $:GPU_PARALLEL_LOOP(private='[i,j,k,l]', collapse=4)
             do i = iv%beg, iv%end
                 do l = is3%beg, is3%end
diff --git a/src/simulation/m_riemann_solvers.fpp b/src/simulation/m_riemann_solvers.fpp
index d8b5792fe0..b93b342ea5 100644
--- a/src/simulation/m_riemann_solvers.fpp
+++ b/src/simulation/m_riemann_solvers.fpp
@@ -291,22 +291,22 @@ contains
 
         integer, intent(in) :: norm_dir
         type(int_bounds_info), intent(in) :: ix, iy, iz
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
-        real(wp), dimension(3) :: vel_L, vel_R
-        real(wp), dimension(3) :: alpha_L, alpha_R
-        real(wp), dimension(10) :: Ys_L, Ys_R
-        real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
-        real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-#:else
-        real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-        real(wp), dimension(num_vels) :: vel_L, vel_R
-        real(wp), dimension(num_fluids) :: alpha_L, alpha_R
-        real(wp), dimension(num_species) :: Ys_L, Ys_R
-        real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
-        real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-#:endif
-        real(wp) :: rho_L, rho_R     
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(3) :: vel_L, vel_R
+            real(wp), dimension(3) :: alpha_L, alpha_R
+            real(wp), dimension(10) :: Ys_L, Ys_R
+            real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(num_vels) :: vel_L, vel_R
+            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
+            real(wp), dimension(num_species) :: Ys_L, Ys_R
+            real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        #:endif
+        real(wp) :: rho_L, rho_R
         real(wp) :: pres_L, pres_R
         real(wp) :: E_L, E_R
         real(wp) :: H_L, H_R
@@ -493,13 +493,13 @@ contains
 
                                 call get_mixture_molecular_weight(Ys_L, MW_L)
                                 call get_mixture_molecular_weight(Ys_R, MW_R)
-#:if USING_AMD
-                                Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
-                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
-#:else
-                                Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) 
-#:endif
+                                #:if USING_AMD
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+                                #:else
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                #:endif
 
                                 R_gas_L = gas_constant/MW_L
                                 R_gas_R = gas_constant/MW_R
@@ -1068,25 +1068,25 @@ contains
 
         integer, intent(in) :: norm_dir
         type(int_bounds_info), intent(in) :: ix, iy, iz
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
-        real(wp), dimension(3) :: vel_L, vel_R
-        real(wp), dimension(3) :: alpha_L, alpha_R
-        real(wp), dimension(10) :: Ys_L, Ys_R
-        real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
-        real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-         real(wp), dimension(3, 3) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
-#:else 
-        real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-        real(wp), dimension(num_vels) :: vel_L, vel_R
-        real(wp), dimension(num_fluids) :: alpha_L, alpha_R
-        real(wp), dimension(num_species) :: Ys_L, Ys_R
-        real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
-        real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-        real(wp), dimension(num_dims, num_dims) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(3) :: vel_L, vel_R
+            real(wp), dimension(3) :: alpha_L, alpha_R
+            real(wp), dimension(10) :: Ys_L, Ys_R
+            real(wp), dimension(10) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+            real(wp), dimension(3, 3) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(num_vels) :: vel_L, vel_R
+            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
+            real(wp), dimension(num_species) :: Ys_L, Ys_R
+            real(wp), dimension(num_species) :: Cp_iL, Cp_iR, Xs_L, Xs_R, Gamma_iL, Gamma_iR
+            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+            real(wp), dimension(num_dims, num_dims) :: vel_grad_L, vel_grad_R       !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+        #:endif
         real(wp) :: rho_L, rho_R
-       
+
         real(wp) :: pres_L, pres_R
         real(wp) :: E_L, E_R
         real(wp) :: H_L, H_R
@@ -1274,13 +1274,13 @@ contains
                                 call get_mixture_molecular_weight(Ys_L, MW_L)
                                 call get_mixture_molecular_weight(Ys_R, MW_R)
 
-#:if USING_AMD
-                                Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
-                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
-#:else
-                                Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) 
-#:endif
+                                #:if USING_AMD
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+                                #:else
+                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
+                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                #:endif
 
                                 R_gas_L = gas_constant/MW_L
                                 R_gas_R = gas_constant/MW_R
@@ -1319,28 +1319,28 @@ contains
                                 H_R = (E_R + pres_R)/rho_R
                             elseif (mhd .and. relativity) then
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                Ga%L = 1._wp/sqrt(1._wp - vel_L_rms)
-                                Ga%R = 1._wp/sqrt(1._wp - vel_R_rms)
-                                vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3)
-                                vdotB%R = vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3)
+                                    Ga%L = 1._wp/sqrt(1._wp - vel_L_rms)
+                                    Ga%R = 1._wp/sqrt(1._wp - vel_R_rms)
+                                    vdotB%L = vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3)
+                                    vdotB%R = vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3)
 
-                                b4%L(1:3) = B%L(1:3)/Ga%L + Ga%L*vel_L(1:3)*vdotB%L
-                                b4%R(1:3) = B%R(1:3)/Ga%R + Ga%R*vel_R(1:3)*vdotB%R
-                                B2%L = B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp
-                                B2%R = B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp
+                                    b4%L(1:3) = B%L(1:3)/Ga%L + Ga%L*vel_L(1:3)*vdotB%L
+                                    b4%R(1:3) = B%R(1:3)/Ga%R + Ga%R*vel_R(1:3)*vdotB%R
+                                    B2%L = B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp
+                                    B2%R = B%R(1)**2._wp + B%R(2)**2._wp + B%R(3)**2._wp
 
-                                pres_mag%L = 0.5_wp*(B2%L/Ga%L**2._wp + vdotB%L**2._wp)
-                                pres_mag%R = 0.5_wp*(B2%R/Ga%R**2._wp + vdotB%R**2._wp)
+                                    pres_mag%L = 0.5_wp*(B2%L/Ga%L**2._wp + vdotB%L**2._wp)
+                                    pres_mag%R = 0.5_wp*(B2%R/Ga%R**2._wp + vdotB%R**2._wp)
 
-                                ! Hard-coded EOS
-                                H_L = 1._wp + (gamma_L + 1)*pres_L/rho_L
-                                H_R = 1._wp + (gamma_R + 1)*pres_R/rho_R
+                                    ! Hard-coded EOS
+                                    H_L = 1._wp + (gamma_L + 1)*pres_L/rho_L
+                                    H_R = 1._wp + (gamma_R + 1)*pres_R/rho_R
 
-                                cm%L(1:3) = (rho_L*H_L*Ga%L**2 + B2%L)*vel_L(1:3) - vdotB%L*B%L(1:3)
-                                cm%R(1:3) = (rho_R*H_R*Ga%R**2 + B2%R)*vel_R(1:3) - vdotB%R*B%R(1:3)
+                                    cm%L(1:3) = (rho_L*H_L*Ga%L**2 + B2%L)*vel_L(1:3) - vdotB%L*B%L(1:3)
+                                    cm%R(1:3) = (rho_R*H_R*Ga%R**2 + B2%R)*vel_R(1:3) - vdotB%R*B%R(1:3)
 
-                                E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L
-                                E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R
+                                    E_L = rho_L*H_L*Ga%L**2 - pres_L + 0.5_wp*(B2%L + vel_L_rms*B2%L - vdotB%L**2._wp) - rho_L*Ga%L
+                                    E_R = rho_R*H_R*Ga%R**2 - pres_R + 0.5_wp*(B2%R + vel_R_rms*B2%R - vdotB%R**2._wp) - rho_R*Ga%R
                                 #:endif
                             elseif (mhd .and. .not. relativity) then
                                 pres_mag%L = 0.5_wp*(B%L(1)**2._wp + B%L(2)**2._wp + B%L(3)**2._wp)
@@ -1529,11 +1529,11 @@ contains
                             if (mhd .and. (.not. relativity)) then
                                 ! energy flux = (E + p + p_mag) * v_${XYZ}$ - B_${XYZ}$ * (v_x*B_x + v_y*B_y + v_z*B_z)
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
-                                    (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) &
-                                     - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) &
-                                     + s_M*s_P*(E_L - E_R)) &
-                                    /(s_M - s_P)
+                                    flux_rs${XYZ}$_vf(j, k, l, E_idx) = &
+                                        (s_M*(vel_R(norm_dir)*(E_R + pres_R + pres_mag%R) - B%R(norm_dir)*(vel_R(1)*B%R(1) + vel_R(2)*B%R(2) + vel_R(3)*B%R(3))) &
+                                         - s_P*(vel_L(norm_dir)*(E_L + pres_L + pres_mag%L) - B%L(norm_dir)*(vel_L(1)*B%L(1) + vel_L(2)*B%L(2) + vel_L(3)*B%L(3))) &
+                                         + s_M*s_P*(E_L - E_R)) &
+                                        /(s_M - s_P)
                                 #:endif
                             elseif (mhd .and. relativity) then
                                 ! energy flux = m_${XYZ}$ - mass flux
@@ -1756,16 +1756,16 @@ contains
                                 vel_grad_L(i, 1) = (dqL_prim_dx_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
                                 vel_grad_R(i, 1) = (dqR_prim_dx_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                if (num_dims > 1) then
-                                    vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
-                                    vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
-                                end if
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                if (num_dims > 2) then
-                                    vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
-                                    vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
-                                end if
-                                #:endif
+                                    if (num_dims > 1) then
+                                        vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
+                                        vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
+                                    end if
+                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                        if (num_dims > 2) then
+                                            vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(1))
+                                            vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(1))
+                                        end if
+                                    #:endif
                                 #:endif
                             end do
 
@@ -1773,60 +1773,60 @@ contains
                                 flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(1) + vel_grad_R(1, 1)*vel_R(1))
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                if (num_dims > 1) then
-                                    flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
-
-                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(2) + vel_grad_R(1, 2)*vel_R(2)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(2) + vel_grad_R(2, 1)*vel_R(2))
-                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    if (num_dims > 2) then
-                                        flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
-
-                                        flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
-                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(3) + vel_grad_R(1, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(3) + vel_grad_R(3, 1)*vel_R(3))
+                                    if (num_dims > 1) then
+                                        flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
+
+                                        flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
+                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(2) + vel_grad_R(1, 2)*vel_R(2)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(2) + vel_grad_R(2, 1)*vel_R(2))
+                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                            if (num_dims > 2) then
+                                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
+
+                                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
+                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(3) + vel_grad_R(1, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(3) + vel_grad_R(3, 1)*vel_R(3))
+                                            end if
+                                        #:endif
                                     end if
-                                    #:endif
-                                end if
                                 #:endif
 
                             else if (norm_dir == 2) then
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
 
-                                flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
 
-                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(1) + vel_grad_R(1, 2)*vel_R(1)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(1) + vel_grad_R(2, 1)*vel_R(1))
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                if (num_dims > 2) then
-                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
+                                    flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2) + vel_grad_R(1, 2)) - 0.5_wp*(vel_grad_L(2, 1) + vel_grad_R(2, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 2)*vel_L(1) + vel_grad_R(1, 2)*vel_R(1)) - 0.5_wp*(vel_grad_L(2, 1)*vel_L(1) + vel_grad_R(2, 1)*vel_R(1))
+                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                        if (num_dims > 2) then
+                                            flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
 
-                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(3) + vel_grad_R(2, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(3) + vel_grad_R(3, 2)*vel_R(3))
-                                end if
-                                #:endif 
+                                            flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
+                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(3) + vel_grad_R(2, 3)*vel_R(3)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(3) + vel_grad_R(3, 2)*vel_R(3))
+                                        end if
+                                    #:endif
                                 #:endif
                             else
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
 
-                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(3) + vel_grad_R(2, 2)*vel_R(3))
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (-2._wp/3._wp)*0.5_wp*(vel_grad_L(2, 2)*vel_L(3) + vel_grad_R(2, 2)*vel_R(3))
 
-                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(1) + vel_grad_R(1, 3)*vel_R(1)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(1) + vel_grad_R(3, 1)*vel_R(1))
+                                    flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3) + vel_grad_R(1, 3)) - 0.5_wp*(vel_grad_L(3, 1) + vel_grad_R(3, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 3)*vel_L(1) + vel_grad_R(1, 3)*vel_R(1)) - 0.5_wp*(vel_grad_L(3, 1)*vel_L(1) + vel_grad_R(3, 1)*vel_R(1))
 
-                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - (4._wp/3._wp)*0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
 
-                                flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(2) + vel_grad_R(2, 3)*vel_R(2)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(2) + vel_grad_R(3, 2)*vel_R(2))
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3) + vel_grad_R(2, 3)) - 0.5_wp*(vel_grad_L(3, 2) + vel_grad_R(3, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 3)*vel_L(2) + vel_grad_R(2, 3)*vel_R(2)) - 0.5_wp*(vel_grad_L(3, 2)*vel_L(2) + vel_grad_R(3, 2)*vel_R(2))
                                 #:endif
                             end if
                         end if
@@ -1838,16 +1838,16 @@ contains
                                 vel_grad_L(i, 1) = (dqL_prim_dx_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
                                 vel_grad_R(i, 1) = (dqR_prim_dx_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                if (num_dims > 1) then
-                                    vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
-                                    vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
-                                end if
+                                    if (num_dims > 1) then
+                                        vel_grad_L(i, 2) = (dqL_prim_dy_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
+                                        vel_grad_R(i, 2) = (dqR_prim_dy_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
+                                    end if
                                 #:endif
-                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                if (num_dims > 2) then
-                                    vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
-                                    vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
-                                end if
+                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                    if (num_dims > 2) then
+                                        vel_grad_L(i, 3) = (dqL_prim_dz_vf(momxb + i - 1)%sf(j, k, l)/Re_L(2))
+                                        vel_grad_R(i, 3) = (dqR_prim_dz_vf(momxb + i - 1)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3))/Re_R(2))
+                                    end if
                                 #:endif
                             end do
 
@@ -1855,44 +1855,44 @@ contains
                                 flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
                                 flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(1) + vel_grad_R(1, 1)*vel_R(1))
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                if (num_dims > 1) then
-                                    flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
-
-                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                    if (num_dims > 2) then
-                                        flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
+                                    if (num_dims > 1) then
+                                        flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                        flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(1) + vel_grad_R(2, 2)*vel_R(1))
+
+                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                            if (num_dims > 2) then
+                                                flux_src_vf(momxb)%sf(j, k, l) = flux_src_vf(momxb)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(1) + vel_grad_R(3, 3)*vel_R(1))
+                                            end if
+                                        #:endif
                                     end if
-                                    #:endif
-                                end if
                                 #:endif
 
                             else if (norm_dir == 2) then
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(2) + vel_grad_R(1, 1)*vel_R(2))
 
-                                flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
+                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(2) + vel_grad_R(2, 2)*vel_R(2))
 
-                                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                if (num_dims > 2) then
-                                    flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
-                                end if
-                                #:endif 
+                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                        if (num_dims > 2) then
+                                            flux_src_vf(momxb + 1)%sf(j, k, l) = flux_src_vf(momxb + 1)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(2) + vel_grad_R(3, 3)*vel_R(2))
+                                        end if
+                                    #:endif
                                 #:endif
                             else
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1) + vel_grad_R(1, 1))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(1, 1)*vel_L(3) + vel_grad_R(1, 1)*vel_R(3))
 
-                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(3) + vel_grad_R(2, 2)*vel_R(3))
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2) + vel_grad_R(2, 2))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(2, 2)*vel_L(3) + vel_grad_R(2, 2)*vel_R(3))
 
-                                flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
+                                    flux_src_vf(momxb + 2)%sf(j, k, l) = flux_src_vf(momxb + 2)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3) + vel_grad_R(3, 3))
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) - 0.5_wp*(vel_grad_L(3, 3)*vel_L(3) + vel_grad_R(3, 3)*vel_R(3))
                                 #:endif
                             end if
 
@@ -1972,27 +1972,27 @@ contains
         integer, intent(in) :: norm_dir
         type(int_bounds_info), intent(in) :: ix, iy, iz
 
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
-        real(wp), dimension(3) :: alpha_L, alpha_R
-        real(wp), dimension(3) :: vel_L, vel_R
-#:else
-        real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
-        real(wp), dimension(num_fluids) :: alpha_L, alpha_R
-        real(wp), dimension(num_dims) :: vel_L, vel_R
-#:endif
-        
-        real(wp) :: rho_L, rho_R        
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(3) :: alpha_L, alpha_R
+            real(wp), dimension(3) :: vel_L, vel_R
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho_L, alpha_rho_R
+            real(wp), dimension(num_fluids) :: alpha_L, alpha_R
+            real(wp), dimension(num_dims) :: vel_L, vel_R
+        #:endif
+
+        real(wp) :: rho_L, rho_R
         real(wp) :: pres_L, pres_R
         real(wp) :: E_L, E_R
         real(wp) :: H_L, H_R
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(10) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
-        real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-#:else
-        real(wp), dimension(num_species) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
-        real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(10) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
+            real(wp), dimension(10) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        #:else
+            real(wp), dimension(num_species) :: Ys_L, Ys_R, Xs_L, Xs_R, Gamma_iL, Gamma_iR, Cp_iL, Cp_iR
+            real(wp), dimension(num_species) :: Yi_avg, Phi_avg, h_iL, h_iR, h_avg_2
+        #:endif
         real(wp) :: Cp_avg, Cv_avg, T_avg, c_sum_Yi_Phi, eps
         real(wp) :: T_L, T_R
         real(wp) :: MW_L, MW_R
@@ -2017,17 +2017,17 @@ contains
         real(wp) :: xi_L, xi_R !< Left and right wave speeds functions
         real(wp) :: xi_M, xi_P
         real(wp) :: xi_MP, xi_PP
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(3) :: R0_L, R0_R
-        real(wp), dimension(3) :: V0_L, V0_R
-        real(wp), dimension(3) :: P0_L, P0_R
-        real(wp), dimension(3) :: pbw_L, pbw_R
-#:else
-        real(wp), dimension(nb) :: R0_L, R0_R
-        real(wp), dimension(nb) :: V0_L, V0_R
-        real(wp), dimension(nb) :: P0_L, P0_R
-        real(wp), dimension(nb) :: pbw_L, pbw_R
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: R0_L, R0_R
+            real(wp), dimension(3) :: V0_L, V0_R
+            real(wp), dimension(3) :: P0_L, P0_R
+            real(wp), dimension(3) :: pbw_L, pbw_R
+        #:else
+            real(wp), dimension(nb) :: R0_L, R0_R
+            real(wp), dimension(nb) :: V0_L, V0_R
+            real(wp), dimension(nb) :: P0_L, P0_R
+            real(wp), dimension(nb) :: pbw_L, pbw_R
+        #:endif
 
         real(wp) :: alpha_L_sum, alpha_R_sum, nbub_L, nbub_R
         real(wp) :: ptilde_L, ptilde_R
@@ -2037,11 +2037,11 @@ contains
         real(wp) :: R3V2Lbar, R3V2Rbar
 
         real(wp), dimension(6) :: tau_e_L, tau_e_R
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: xi_field_L, xi_field_R
-#:else
-        real(wp), dimension(num_dims) :: xi_field_L, xi_field_R
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: xi_field_L, xi_field_R
+        #:else
+            real(wp), dimension(num_dims) :: xi_field_L, xi_field_R
+        #:endif
         real(wp) :: G_L, G_R
 
         real(wp) :: vel_L_rms, vel_R_rms, vel_avg_rms
@@ -3262,13 +3262,13 @@ contains
                                     call get_mixture_molecular_weight(Ys_L, MW_L)
                                     call get_mixture_molecular_weight(Ys_R, MW_R)
 
-#:if USING_AMD
-                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
-                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
-#:else
-                                    Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
-                                    Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:) 
-#:endif
+                                    #:if USING_AMD
+                                        Xs_L(:) = Ys_L(:)*MW_L/molecular_weights_nonparameter(:)
+                                        Xs_R(:) = Ys_R(:)*MW_R/molecular_weights_nonparameter(:)
+                                    #:else
+                                        Xs_L(:) = Ys_L(:)*MW_L/molecular_weights(:)
+                                        Xs_R(:) = Ys_R(:)*MW_R/molecular_weights(:)
+                                    #:endif
 
                                     R_gas_L = gas_constant/MW_L
                                     R_gas_R = gas_constant/MW_R
@@ -3720,11 +3720,11 @@ contains
         type(int_bounds_info), intent(in) :: ix, iy, iz
 
         ! Local variables:
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
-#:else 
-        real(wp), dimension(num_fluids) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_L, alpha_R, alpha_rho_L, alpha_rho_R
+        #:endif
         type(riemann_states_vec3) :: vel
         type(riemann_states) :: rho, pres, E, H_no_mag
         type(riemann_states) :: gamma, pi_inf, qv
@@ -4682,21 +4682,21 @@ contains
         type(int_bounds_info), intent(in) :: ix, iy, iz
 
         ! Local variables
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: avg_v_int       !!< Averaged interface velocity $(v_x, v_y, v_z)$ (grid directions).
-        real(wp), dimension(3) :: avg_dvdx_int    !!< Averaged interface $\partial v_i/\partial x$ (grid dir 1).
-        real(wp), dimension(3) :: avg_dvdy_int    !!< Averaged interface $\partial v_i/\partial y$ (grid dir 2).
-        real(wp), dimension(3) :: avg_dvdz_int    !!< Averaged interface $\partial v_i/\partial z$ (grid dir 3).
-        real(wp), dimension(3) :: vel_src_int !!< Interface velocity $(v_1,v_2,v_3)$ (grid directions) for viscous work.
-        real(wp), dimension(3) :: stress_vector_shear !!< Shear stress vector $(\sigma_{N1}, \sigma_{N2}, \sigma_{N3})$ on N-face (grid directions).
-#:else 
-        real(wp), dimension(num_dims) :: avg_v_int       !!< Averaged interface velocity $(v_x, v_y, v_z)$ (grid directions).
-        real(wp), dimension(num_dims) :: avg_dvdx_int    !!< Averaged interface $\partial v_i/\partial x$ (grid dir 1).
-        real(wp), dimension(num_dims) :: avg_dvdy_int    !!< Averaged interface $\partial v_i/\partial y$ (grid dir 2).
-        real(wp), dimension(num_dims) :: avg_dvdz_int    !!< Averaged interface $\partial v_i/\partial z$ (grid dir 3).
-        real(wp), dimension(num_dims) :: vel_src_int !!< Interface velocity $(v_1,v_2,v_3)$ (grid directions) for viscous work.
-        real(wp), dimension(num_dims) :: stress_vector_shear !!< Shear stress vector $(\sigma_{N1}, \sigma_{N2}, \sigma_{N3})$ on N-face (grid directions).
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: avg_v_int       !!< Averaged interface velocity $(v_x, v_y, v_z)$ (grid directions).
+            real(wp), dimension(3) :: avg_dvdx_int    !!< Averaged interface $\partial v_i/\partial x$ (grid dir 1).
+            real(wp), dimension(3) :: avg_dvdy_int    !!< Averaged interface $\partial v_i/\partial y$ (grid dir 2).
+            real(wp), dimension(3) :: avg_dvdz_int    !!< Averaged interface $\partial v_i/\partial z$ (grid dir 3).
+            real(wp), dimension(3) :: vel_src_int !!< Interface velocity $(v_1,v_2,v_3)$ (grid directions) for viscous work.
+            real(wp), dimension(3) :: stress_vector_shear !!< Shear stress vector $(\sigma_{N1}, \sigma_{N2}, \sigma_{N3})$ on N-face (grid directions).
+        #:else
+            real(wp), dimension(num_dims) :: avg_v_int       !!< Averaged interface velocity $(v_x, v_y, v_z)$ (grid directions).
+            real(wp), dimension(num_dims) :: avg_dvdx_int    !!< Averaged interface $\partial v_i/\partial x$ (grid dir 1).
+            real(wp), dimension(num_dims) :: avg_dvdy_int    !!< Averaged interface $\partial v_i/\partial y$ (grid dir 2).
+            real(wp), dimension(num_dims) :: avg_dvdz_int    !!< Averaged interface $\partial v_i/\partial z$ (grid dir 3).
+            real(wp), dimension(num_dims) :: vel_src_int !!< Interface velocity $(v_1,v_2,v_3)$ (grid directions) for viscous work.
+            real(wp), dimension(num_dims) :: stress_vector_shear !!< Shear stress vector $(\sigma_{N1}, \sigma_{N2}, \sigma_{N3})$ on N-face (grid directions).
+        #:endif
         real(wp) :: stress_normal_bulk  !!< Normal bulk stress component $\sigma_{NN}$ on N-face.
 
         real(wp) :: Re_s, Re_b        !!< Effective interface shear and bulk Reynolds numbers.
@@ -4760,12 +4760,12 @@ contains
 
                     ! Divergence in cylindrical coordinates (vx=vz_cyl, vy=vr_cyl, vz=vtheta_cyl)
                     #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                    divergence_cyl = avg_dvdx_int(1) + avg_dvdy_int(2) + avg_v_int(2)/r_eff
-                    if (num_dims > 2) then
-                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                            divergence_cyl = divergence_cyl + avg_dvdz_int(3)/r_eff
-                        #:endif
-                    end if
+                        divergence_cyl = avg_dvdx_int(1) + avg_dvdy_int(2) + avg_v_int(2)/r_eff
+                        if (num_dims > 2) then
+                            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                divergence_cyl = divergence_cyl + avg_dvdz_int(3)/r_eff
+                            #:endif
+                        end if
                     #:endif
 
                     stress_vector_shear = 0.0_wp
@@ -4779,7 +4779,7 @@ contains
                             stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const
                             if (num_dims > 1) then
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                stress_vector_shear(2) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
+                                    stress_vector_shear(2) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
                                 #:endif
                             end if
                             if (num_dims > 2) then
@@ -4790,13 +4790,13 @@ contains
                         case (2) ! Y-face (radial normal, r_cyl)
                             if (num_dims > 1) then
                                 #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                                stress_vector_shear(1) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
-                                stress_vector_shear(2) = (2.0_wp*avg_dvdy_int(2))/Re_s + div_v_term_const
-                                if (num_dims > 2) then
-                                    #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-                                        stress_vector_shear(3) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s
-                                    #:endif
-                                end if
+                                    stress_vector_shear(1) = (avg_dvdy_int(1) + avg_dvdx_int(2))/Re_s
+                                    stress_vector_shear(2) = (2.0_wp*avg_dvdy_int(2))/Re_s + div_v_term_const
+                                    if (num_dims > 2) then
+                                        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                                            stress_vector_shear(3) = (avg_dvdz_int(2)/r_eff - avg_v_int(3)/r_eff + avg_dvdy_int(3))/Re_s
+                                        #:endif
+                                    end if
                                 #:endif
                             else
                                 stress_vector_shear(1) = (2.0_wp*avg_dvdx_int(1))/Re_s + div_v_term_const
@@ -4865,17 +4865,17 @@ contains
         integer, intent(in) :: norm_dir
 
         ! Local variables
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3, 3) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
-        real(wp), dimension(3, 3) :: current_tau_shear   !< Current shear stress tensor.
-        real(wp), dimension(3, 3) :: current_tau_bulk    !< Current bulk stress tensor.
-        real(wp), dimension(3) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
-#:else 
-        real(wp), dimension(num_dims, num_dims) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
-        real(wp), dimension(num_dims, num_dims) :: current_tau_shear   !< Current shear stress tensor.
-        real(wp), dimension(num_dims, num_dims) :: current_tau_bulk    !< Current bulk stress tensor.
-        real(wp), dimension(num_dims) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 3) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+            real(wp), dimension(3, 3) :: current_tau_shear   !< Current shear stress tensor.
+            real(wp), dimension(3, 3) :: current_tau_bulk    !< Current bulk stress tensor.
+            real(wp), dimension(3) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
+        #:else
+            real(wp), dimension(num_dims, num_dims) :: vel_grad_avg        !< Averaged velocity gradient tensor `d(vel_i)/d(coord_j)`.
+            real(wp), dimension(num_dims, num_dims) :: current_tau_shear   !< Current shear stress tensor.
+            real(wp), dimension(num_dims, num_dims) :: current_tau_bulk    !< Current bulk stress tensor.
+            real(wp), dimension(num_dims) :: vel_src_at_interface         !< Interface velocities (u,v,w) for viscous work.
+        #:endif
         integer, dimension(3) :: idx_right_phys                     !< Physical (j,k,l) indices for right state.
 
         real(wp) :: Re_shear !< Interface shear Reynolds number.
@@ -4905,8 +4905,8 @@ contains
                                                                 dvelR_dx_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
                         if (num_dims > 1) then
                             #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-                            vel_grad_avg(vel_comp_idx, 2) = 0.5_wp*(dvelL_dy_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
-                                                                    dvelR_dy_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
+                                vel_grad_avg(vel_comp_idx, 2) = 0.5_wp*(dvelL_dy_vf(vel_comp_idx)%sf(j_loop, k_loop, l_loop) + &
+                                                                        dvelR_dy_vf(vel_comp_idx)%sf(idx_right_phys(1), idx_right_phys(2), idx_right_phys(3)))
                             #:endif
                         end if
                         if (num_dims > 2) then
@@ -4988,13 +4988,13 @@ contains
         $:GPU_ROUTINE(parallelism='[seq]')
 
         ! Arguments
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3, 3), intent(in) :: vel_grad_avg
-        real(wp), dimension(3, 3), intent(out) :: tau_shear_out
-#:else 
-        real(wp), dimension(num_dims, num_dims), intent(in) :: vel_grad_avg
-        real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 3), intent(in) :: vel_grad_avg
+            real(wp), dimension(3, 3), intent(out) :: tau_shear_out
+        #:else
+            real(wp), dimension(num_dims, num_dims), intent(in) :: vel_grad_avg
+            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_shear_out
+        #:endif
         real(wp), intent(in) :: Re_shear
         real(wp), intent(in) :: divergence_v
 
@@ -5027,11 +5027,11 @@ contains
         ! Arguments
         real(wp), intent(in) :: Re_bulk
         real(wp), intent(in) :: divergence_v
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
-#:else 
-        real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 3), intent(out) :: tau_bulk_out
+        #:else
+            real(wp), dimension(num_dims, num_dims), intent(out) :: tau_bulk_out
+        #:endif
 
         ! Local variables
         integer :: i_dim !< Loop iterator for diagonal components.
diff --git a/src/simulation/m_sim_helpers.fpp b/src/simulation/m_sim_helpers.fpp
index 64f9fd427d..f5d75aa148 100644
--- a/src/simulation/m_sim_helpers.fpp
+++ b/src/simulation/m_sim_helpers.fpp
@@ -97,22 +97,22 @@ contains
             & cray_inline=True)
 
         type(scalar_field), intent(in), dimension(sys_size) :: q_prim_vf
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), intent(inout), dimension(3) :: alpha
-        real(wp), intent(inout), dimension(3) :: vel
-#:else 
-        real(wp), intent(inout), dimension(num_fluids) :: alpha
-        real(wp), intent(inout), dimension(num_vels) :: vel
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), intent(inout), dimension(3) :: alpha
+            real(wp), intent(inout), dimension(3) :: vel
+        #:else
+            real(wp), intent(inout), dimension(num_fluids) :: alpha
+            real(wp), intent(inout), dimension(num_vels) :: vel
+        #:endif
         real(wp), intent(inout) :: rho, gamma, pi_inf, vel_sum, H, pres
         real(wp), intent(out) :: qv
         integer, intent(in) :: j, k, l
         real(wp), dimension(2), intent(inout) :: Re
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: alpha_rho, Gs
-#:else
-        real(wp), dimension(num_fluids) :: alpha_rho, Gs
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_rho, Gs
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_rho, Gs
+        #:endif
         real(wp) :: E, G_local
 
         integer :: i
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 86284622c6..d46a3e0793 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1375,8 +1375,8 @@ contains
         ! the modules. The preparations below DO DEPEND on the grid being complete.
         if (igr .or. dummy) then
             call s_initialize_igr_module()
-        end if 
-        if(.not. igr .or. dummy) then
+        end if
+        if (.not. igr .or. dummy) then
             if (recon_type == WENO_TYPE) then
                 call s_initialize_weno_module()
             elseif (recon_type == MUSCL_TYPE) then
diff --git a/src/simulation/m_surface_tension.fpp b/src/simulation/m_surface_tension.fpp
index 3afa1bd4b2..99bac7d722 100644
--- a/src/simulation/m_surface_tension.fpp
+++ b/src/simulation/m_surface_tension.fpp
@@ -82,11 +82,11 @@ contains
             intent(inout) :: flux_src_vf
         integer, intent(in) :: id
         type(int_bounds_info), intent(in) :: isx, isy, isz
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3, 3) :: Omega
-#:else
-        real(wp), dimension(num_dims, num_dims) :: Omega 
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3, 3) :: Omega
+        #:else
+            real(wp), dimension(num_dims, num_dims) :: Omega
+        #:endif
         real(wp) :: w1L, w1R, w2L, w2R, w3L, w3R, w1, w2, w3
         real(wp) :: normWL, normWR, normW
         integer :: j, k, l, i
@@ -138,49 +138,49 @@ contains
 
         elseif (id == 2) then
             #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-            $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW]')
-            do l = isz%beg, isz%end
-                do k = isy%beg, isy%end
-                    do j = isx%beg, isx%end
+                $:GPU_PARALLEL_LOOP(collapse=3, private='[Omega, w1L, w2L, w3L, w1R, w2R, w3R, w1, w2, w3, normWL, normWR, normW]')
+                do l = isz%beg, isz%end
+                    do k = isy%beg, isy%end
+                        do j = isx%beg, isx%end
 
-                        w1L = gL_y(k, j, l, 1)
-                        w2L = gL_y(k, j, l, 2)
-                        w3L = 0._wp
-                        if (p > 0) w3L = gL_y(k, j, l, 3)
+                            w1L = gL_y(k, j, l, 1)
+                            w2L = gL_y(k, j, l, 2)
+                            w3L = 0._wp
+                            if (p > 0) w3L = gL_y(k, j, l, 3)
 
-                        w1R = gR_y(k + 1, j, l, 1)
-                        w2R = gR_y(k + 1, j, l, 2)
-                        w3R = 0._wp
-                        if (p > 0) w3R = gR_y(k + 1, j, l, 3)
+                            w1R = gR_y(k + 1, j, l, 1)
+                            w2R = gR_y(k + 1, j, l, 2)
+                            w3R = 0._wp
+                            if (p > 0) w3R = gR_y(k + 1, j, l, 3)
 
-                        normWL = gL_y(k, j, l, num_dims + 1)
-                        normWR = gR_y(k + 1, j, l, num_dims + 1)
+                            normWL = gL_y(k, j, l, num_dims + 1)
+                            normWR = gR_y(k + 1, j, l, num_dims + 1)
 
-                        w1 = (w1L + w1R)/2._wp
-                        w2 = (w2L + w2R)/2._wp
-                        w3 = (w3L + w3R)/2._wp
-                        normW = (normWL + normWR)/2._wp
+                            w1 = (w1L + w1R)/2._wp
+                            w2 = (w2L + w2R)/2._wp
+                            w3 = (w3L + w3R)/2._wp
+                            normW = (normWL + normWR)/2._wp
 
-                        if (normW > capillary_cutoff) then
-                            @:compute_capillary_stress_tensor()
+                            if (normW > capillary_cutoff) then
+                                @:compute_capillary_stress_tensor()
 
-                            do i = 1, num_dims
+                                do i = 1, num_dims
 
-                                flux_src_vf(momxb + i - 1)%sf(j, k, l) = &
-                                    flux_src_vf(momxb + i - 1)%sf(j, k, l) + Omega(2, i)
+                                    flux_src_vf(momxb + i - 1)%sf(j, k, l) = &
+                                        flux_src_vf(momxb + i - 1)%sf(j, k, l) + Omega(2, i)
 
-                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + &
-                                                                 Omega(2, i)*vSrc_rsy_vf(k, j, l, i)
+                                    flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + &
+                                                                     Omega(2, i)*vSrc_rsy_vf(k, j, l, i)
 
-                            end do
+                                end do
 
-                            flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + &
-                                                             sigma*c_divs(num_dims + 1)%sf(j, k, l)*vSrc_rsy_vf(k, j, l, 2)
-                        end if
+                                flux_src_vf(E_idx)%sf(j, k, l) = flux_src_vf(E_idx)%sf(j, k, l) + &
+                                                                 sigma*c_divs(num_dims + 1)%sf(j, k, l)*vSrc_rsy_vf(k, j, l, 2)
+                            end if
+                        end do
                     end do
                 end do
-            end do
-            $:END_GPU_PARALLEL_LOOP()
+                $:END_GPU_PARALLEL_LOOP()
             #:endif
 
         elseif (id == 3) then
diff --git a/src/simulation/m_time_steppers.fpp b/src/simulation/m_time_steppers.fpp
index c34792f554..3d1d0b4a2f 100644
--- a/src/simulation/m_time_steppers.fpp
+++ b/src/simulation/m_time_steppers.fpp
@@ -464,16 +464,16 @@ contains
         @:ALLOCATE(bc_type(1,1)%sf(0:0,0:n,0:p))
         @:ALLOCATE(bc_type(1,2)%sf(0:0,0:n,0:p))
         #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
-        if (n > 0) then
-            @:ALLOCATE(bc_type(2,1)%sf(-buff_size:m+buff_size,0:0,0:p))
-            @:ALLOCATE(bc_type(2,2)%sf(-buff_size:m+buff_size,0:0,0:p))
-            #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
-            if (p > 0) then
-                @:ALLOCATE(bc_type(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
-                @:ALLOCATE(bc_type(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
+            if (n > 0) then
+                @:ALLOCATE(bc_type(2,1)%sf(-buff_size:m+buff_size,0:0,0:p))
+                @:ALLOCATE(bc_type(2,2)%sf(-buff_size:m+buff_size,0:0,0:p))
+                #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+                    if (p > 0) then
+                        @:ALLOCATE(bc_type(3,1)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
+                        @:ALLOCATE(bc_type(3,2)%sf(-buff_size:m+buff_size,-buff_size:n+buff_size,0:0))
+                    end if
+                #:endif
             end if
-            #:endif
-        end if
         #:endif
 
         do i = 1, num_dims
@@ -532,8 +532,8 @@ contains
                 if (run_time_info) then
                     if (igr .or. dummy) then
                         call s_write_run_time_information(q_cons_ts(1)%vf, t_step)
-                    end if 
-                    if(.not. igr .or. dummy) then
+                    end if
+                    if (.not. igr .or. dummy) then
                         call s_write_run_time_information(q_prim_vf, t_step)
                     end if
                 end if
@@ -722,15 +722,15 @@ contains
     impure subroutine s_compute_dt()
 
         real(wp) :: rho        !< Cell-avg. density
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: vel        !< Cell-avg. velocity
-        real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
-#:else 
-        real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
-        real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: vel        !< Cell-avg. velocity
+            real(wp), dimension(3) :: alpha      !< Cell-avg. volume fraction
+        #:else
+            real(wp), dimension(num_vels) :: vel        !< Cell-avg. velocity
+            real(wp), dimension(num_fluids) :: alpha      !< Cell-avg. volume fraction
+        #:endif
         real(wp) :: vel_sum    !< Cell-avg. velocity sum
-        real(wp) :: pres       !< Cell-avg. pressure   
+        real(wp) :: pres       !< Cell-avg. pressure
         real(wp) :: gamma      !< Cell-avg. sp. heat ratio
         real(wp) :: pi_inf     !< Cell-avg. liquid stiffness function
         real(wp) :: qv         !< Cell-avg. fluid reference energy
diff --git a/src/simulation/m_viscous.fpp b/src/simulation/m_viscous.fpp
index 3cd6099539..e81a243373 100644
--- a/src/simulation/m_viscous.fpp
+++ b/src/simulation/m_viscous.fpp
@@ -72,13 +72,13 @@ contains
 
         real(wp) :: rho_visc, gamma_visc, pi_inf_visc, alpha_visc_sum  !< Mixture variables
         real(wp), dimension(2) :: Re_visc
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: alpha_visc, alpha_rho_visc
-        real(wp), dimension(3, 3) :: tau_Re
-#:else 
-        real(wp), dimension(num_fluids) :: alpha_visc, alpha_rho_visc
-        real(wp), dimension(num_dims, num_dims) :: tau_Re
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: alpha_visc, alpha_rho_visc
+            real(wp), dimension(3, 3) :: tau_Re
+        #:else
+            real(wp), dimension(num_fluids) :: alpha_visc, alpha_rho_visc
+            real(wp), dimension(num_dims, num_dims) :: tau_Re
+        #:endif
 
         integer :: i, j, k, l, q !< Generic loop iterator
 
diff --git a/src/simulation/m_weno.fpp b/src/simulation/m_weno.fpp
index ed1f5576ae..32bd2a2d6e 100644
--- a/src/simulation/m_weno.fpp
+++ b/src/simulation/m_weno.fpp
@@ -647,21 +647,21 @@ contains
         integer, intent(in) :: weno_dir
         type(int_bounds_info), intent(in) :: is1_weno_d, is2_weno_d, is3_weno_d
 
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD 
-        real(wp), dimension(-3:2) :: dvd
-        real(wp), dimension(0:4) :: poly
-        real(wp), dimension(0:4) :: alpha
-        real(wp), dimension(0:4) :: omega
-        real(wp), dimension(0:4) :: beta
-        real(wp), dimension(0:4) :: delta
-#:else
-        real(wp), dimension(-weno_polyn:weno_polyn - 1) :: dvd
-        real(wp), dimension(0:weno_num_stencils) :: poly
-        real(wp), dimension(0:weno_num_stencils) :: alpha
-        real(wp), dimension(0:weno_num_stencils) :: omega
-        real(wp), dimension(0:weno_num_stencils) :: beta
-        real(wp), dimension(0:weno_num_stencils) :: delta
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(-3:2) :: dvd
+            real(wp), dimension(0:4) :: poly
+            real(wp), dimension(0:4) :: alpha
+            real(wp), dimension(0:4) :: omega
+            real(wp), dimension(0:4) :: beta
+            real(wp), dimension(0:4) :: delta
+        #:else
+            real(wp), dimension(-weno_polyn:weno_polyn - 1) :: dvd
+            real(wp), dimension(0:weno_num_stencils) :: poly
+            real(wp), dimension(0:weno_num_stencils) :: alpha
+            real(wp), dimension(0:weno_num_stencils) :: omega
+            real(wp), dimension(0:weno_num_stencils) :: beta
+            real(wp), dimension(0:weno_num_stencils) :: delta
+        #:endif
         real(wp), dimension(-3:3) :: v ! temporary field value array for clarity (WENO7 only)
         real(wp) :: tau
 
@@ -730,8 +730,8 @@ contains
                                 do i = 1, v_size
                                     ! reconstruct from left side
 
-                                    alpha(:) = 0._wp 
-                                    omega(:) = 0._wp 
+                                    alpha(:) = 0._wp
+                                    omega(:) = 0._wp
                                     beta(:) = weno_eps
 
                                     dvd(0) = v_rs_ws_${XYZ}$ (j + 1, k, l, i) &
@@ -756,7 +756,7 @@ contains
                                         alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                         omega = alpha/sum(alpha)
                                         alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
-                                                *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
+                                                                     *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
 
                                     elseif (wenoz) then
                                         ! Borges, et al. (2008)
@@ -784,7 +784,7 @@ contains
                                         alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                         omega = alpha/sum(alpha)
                                         alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
-                                                *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
+                                                                     *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
 
                                     elseif (wenoz) then
 
@@ -816,8 +816,8 @@ contains
                                     do i = 1, v_size
                                         ! reconstruct from left side
 
-                                        alpha(:) = 0._wp 
-                                        omega(:) = 0._wp 
+                                        alpha(:) = 0._wp
+                                        omega(:) = 0._wp
                                         delta(:) = 0._wp
                                         beta(:) = weno_eps
 
@@ -860,7 +860,7 @@ contains
                                             alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                             omega = alpha/sum(alpha)
                                             alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
-                                                    *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
+                                                                         *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
 
                                         elseif (wenoz) then
 
@@ -870,10 +870,10 @@ contains
                                             $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
                                                 alpha(q) = d_cbL_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))) ! Equation 28 (note: weno_eps was already added to beta)
-                                            end do 
+                                            end do
 
                                         elseif (teno) then
-                                             ! Fu, et al. (2016)
+                                            ! Fu, et al. (2016)
                                             ! Fu''s code: https://dx.doi.org/10.13140/RG.2.2.36250.34247
                                             tau = abs(beta(2) - beta(0))
                                             $:GPU_LOOP(parallelism='[seq]')
@@ -882,14 +882,14 @@ contains
                                                 alpha(q) = (alpha(q)**3._wp)**2._wp          ! Equation 22 cont. (some CPU compilers cannot optimize x**6.0)
                                             end do
                                             omega = alpha/sum(alpha)                    ! Equation 25 (reuse omega as xi)
-                                            
+
                                             $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
-                                                if(omega(q) < teno_CT) then             ! Equation 26
-                                                    delta(q) = 0._wp 
-                                                else 
-                                                    delta(q) = 1._wp 
-                                                end if 
+                                                if (omega(q) < teno_CT) then             ! Equation 26
+                                                    delta(q) = 0._wp
+                                                else
+                                                    delta(q) = 1._wp
+                                                end if
                                                 alpha(q) = delta(q)*d_cbL_${XYZ}$ (q, j) ! Equation 27
                                             end do
                                         end if
@@ -910,7 +910,6 @@ contains
                                                   + poly_coef_cbR_${XYZ}$ (j, 2, 0)*dvd(-1) &
                                                   + poly_coef_cbR_${XYZ}$ (j, 2, 1)*dvd(-2)
 
-
                                         if (wenojs) then
                                             alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
 
@@ -918,19 +917,19 @@ contains
                                             alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                             omega = alpha/sum(alpha)
                                             alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
-                                                    *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
+                                                                         *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
 
                                         elseif (wenoz) then
 
                                             $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
-                                                alpha(q) = d_cbR_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))) 
-                                            end do 
+                                                alpha(q) = d_cbR_${XYZ}$ (q, j)*(1._wp + (tau/beta(q)))
+                                            end do
 
                                         elseif (teno) then
                                             $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
-                                                alpha(q) = delta(q)*d_cbR_${XYZ}$ (q, j) 
+                                                alpha(q) = delta(q)*d_cbR_${XYZ}$ (q, j)
                                             end do
                                         end if
 
@@ -963,8 +962,8 @@ contains
                                     $:GPU_LOOP(parallelism='[seq]')
                                     do i = 1, v_size
 
-                                        alpha(:) = 0._wp 
-                                        omega(:) = 0._wp 
+                                        alpha(:) = 0._wp
+                                        omega(:) = 0._wp
                                         delta(:) = 0._wp
                                         beta(:) = weno_eps
 
@@ -1078,7 +1077,7 @@ contains
                                             alpha(0:weno_num_stencils) = d_cbL_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                             omega = alpha/sum(alpha)
                                             alpha(0:weno_num_stencils) = (d_cbL_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbL_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
-                                                    *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
+                                                                         *(omega(0:weno_num_stencils)/(d_cbL_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbL_${XYZ}$ (0:weno_num_stencils, j))))
 
                                         elseif (wenoz) then
                                             ! Castro, et al. (2010)
@@ -1087,7 +1086,7 @@ contains
                                             $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
                                                 alpha(q) = d_cbL_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))**wenoz_q) ! wenoz_q = 2,3,4 for stability
-                                            end do 
+                                            end do
 
                                         elseif (teno) then
                                             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3
@@ -1095,14 +1094,14 @@ contains
                                                 alpha = 1._wp + tau/beta
                                                 alpha = (alpha**3._wp)**2._wp ! some CPU compilers cannot optimize x**6.0
                                                 omega = alpha/sum(alpha)
-                                                
+
                                                 $:GPU_LOOP(parallelism='[seq]')
                                                 do q = 0, weno_num_stencils
-                                                    if(omega(q) < teno_CT) then             ! Equation 26
-                                                        delta(q) = 0._wp 
-                                                    else 
-                                                        delta(q) = 1._wp 
-                                                    end if 
+                                                    if (omega(q) < teno_CT) then             ! Equation 26
+                                                        delta(q) = 0._wp
+                                                    else
+                                                        delta(q) = 1._wp
+                                                    end if
                                                     alpha(q) = delta(q)*d_cbL_${XYZ}$ (q, j) ! Equation 27
                                                 end do
                                             #:endif
@@ -1112,7 +1111,7 @@ contains
 
                                         vL_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega(2)*poly(2) + omega(3)*poly(3)
 
-                                        if(teno) then 
+                                        if (teno) then
                                             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3
                                                 vL_rs_vf_${XYZ}$ (j, k, l, i) = vL_rs_vf_${XYZ}$ (j, k, l, i) + omega(4)*poly(4)
                                             #:endif
@@ -1152,19 +1151,19 @@ contains
                                             alpha(0:weno_num_stencils) = d_cbR_${XYZ}$ (0:weno_num_stencils, j)/(beta(0:weno_num_stencils)**2._wp)
                                             omega = alpha/sum(alpha)
                                             alpha(0:weno_num_stencils) = (d_cbR_${XYZ}$ (0:weno_num_stencils, j)*(1._wp + d_cbR_${XYZ}$ (0:weno_num_stencils, j) - 3._wp*omega(0:weno_num_stencils)) + omega(0:weno_num_stencils)**2._wp) &
-                                                    *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
+                                                                         *(omega(0:weno_num_stencils)/(d_cbR_${XYZ}$ (0:weno_num_stencils, j)**2._wp + omega(0:weno_num_stencils)*(1._wp - 2._wp*d_cbR_${XYZ}$ (0:weno_num_stencils, j))))
 
                                         elseif (wenoz) then
 
-                                           $:GPU_LOOP(parallelism='[seq]')
+                                            $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
                                                 alpha(q) = d_cbR_${XYZ}$ (q, j)*(1._wp + (tau/beta(q))**wenoz_q) ! wenoz_q = 2,3,4 for stability
-                                            end do 
+                                            end do
 
                                         elseif (teno) then
                                             $:GPU_LOOP(parallelism='[seq]')
                                             do q = 0, weno_num_stencils
-                                                alpha(q) = delta(q)*d_cbR_${XYZ}$ (q, j) 
+                                                alpha(q) = delta(q)*d_cbR_${XYZ}$ (q, j)
                                             end do
                                         end if
 
@@ -1172,7 +1171,7 @@ contains
 
                                         vR_rs_vf_${XYZ}$ (j, k, l, i) = omega(0)*poly(0) + omega(1)*poly(1) + omega(2)*poly(2) + omega(3)*poly(3)
 
-                                        if(teno) then 
+                                        if (teno) then
                                             #:if not MFC_CASE_OPTIMIZATION or weno_num_stencils > 3
                                                 vR_rs_vf_${XYZ}$ (j, k, l, i) = vR_rs_vf_${XYZ}$ (j, k, l, i) + omega(4)*poly(4)
                                             #:endif

From 7d13d5d5a99974fe1192159a2c2a7a31893eaf24 Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Wed, 28 Jan 2026 23:47:09 -0500
Subject: [PATCH 09/21] Cmake cleanup

---
 CMakeLists.txt | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 082bff3e65..66d7326c3d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -460,9 +460,13 @@ function(MFC_SETUP_TARGET)
         if (MFC_MPI AND ARGS_MPI)
             find_package(MPI COMPONENTS Fortran REQUIRED)
 	    
-	    target_compile_definitions(${a_target} PRIVATE MFC_MPI)
-	    target_compile_options(${a_target} PRIVATE "$ENV{CRAY_MPICH_INC}")
-            target_link_libraries     (${a_target} PRIVATE $ENV{CRAY_MPICH_LIB})
+            target_compile_definitions(${a_target} PRIVATE MFC_MPI)
+            if(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+                target_compile_options(${a_target} PRIVATE "$ENV{CRAY_MPICH_INC}")
+                target_link_libraries(${a_target} PRIVATE $ENV{CRAY_MPICH_LIB})
+            else() 
+                target_link_libraries(${a_target} PRIVATE MPI::MPI_Fortran)
+            endif()
         endif()
 
         if (ARGS_SILO)
@@ -528,8 +532,8 @@ function(MFC_SETUP_TARGET)
                     target_compile_options(${a_target} PRIVATE -fopenmp)
                     target_link_options(${a_target} PRIVATE -fopenmp)
                 elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
-		    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
-		    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
+                    target_compile_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
+                    target_link_options(${a_target} PRIVATE -fopenmp --offload-arch=gfx90a)
                 endif()
             endif()
 

From eeb5e25c240ba1f7bf26c7b3ffe984cee518d042 Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 01:45:57 -0500
Subject: [PATCH 10/21] toolchain and CI workarounds

---
 .github/workflows/bench.yml                   |  8 ++
 .github/workflows/frontier_amd/bench.sh       | 22 ++++++
 .github/workflows/frontier_amd/build.sh       | 26 +++++++
 .../workflows/frontier_amd/submit-bench.sh    | 54 ++++++++++++++
 .github/workflows/frontier_amd/submit.sh      | 56 ++++++++++++++
 .github/workflows/frontier_amd/test.sh        | 20 +++++
 .github/workflows/test.yml                    | 19 +++++
 toolchain/bootstrap/modules.sh                | 26 ++++++-
 toolchain/modules                             |  5 ++
 toolchain/templates/frontier_amd.mako         | 73 +++++++++++++++++++
 10 files changed, 306 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/frontier_amd/bench.sh
 create mode 100644 .github/workflows/frontier_amd/build.sh
 create mode 100644 .github/workflows/frontier_amd/submit-bench.sh
 create mode 100644 .github/workflows/frontier_amd/submit.sh
 create mode 100644 .github/workflows/frontier_amd/test.sh
 create mode 100644 toolchain/templates/frontier_amd.mako

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 2ccdfca87a..0653883140 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -70,6 +70,14 @@ jobs:
             device: gpu
             interface: omp
             build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
+          - cluster: frontier
+            name: Oak Ridge | Frontier (AMD)
+            group: phoenix
+            labels: frontier_amd
+            flag: famd
+            device: gpu
+            interface: omp
+            build_script: "bash .github/workflows/frontier_amd/build.sh gpu omp bench"
     runs-on:
       group: ${{ matrix.group }}
       labels: ${{ matrix.labels }}
diff --git a/.github/workflows/frontier_amd/bench.sh b/.github/workflows/frontier_amd/bench.sh
new file mode 100644
index 0000000000..6e01687e79
--- /dev/null
+++ b/.github/workflows/frontier_amd/bench.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+n_ranks=12
+device_opts=""
+if [ "$job_device" = "gpu" ]; then
+    gpus=$(rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' ')
+    n_ranks=$(echo "$gpus" | wc -w)         # number of GPUs on node
+    gpu_ids=$(echo "$gpus" | tr ' ' '\n' | tr '\n' ' ' | sed 's/ $//')  # GPU IDs from rocm-smi
+    device_opts+="--gpu"
+    if [ "$job_interface" = "acc" ]; then
+        device_opts+=" acc"
+    elif [ "$job_interface" = "omp" ]; then
+        device_opts+=" mp"
+    fi
+    device_opts+=" -g $gpu_ids"
+fi
+
+if [ "$job_device" = "gpu" ]; then
+    ./mfc.sh bench --mem 12 -j $n_ranks -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
+else
+    ./mfc.sh bench --mem 1 -j $(nproc) -o "$job_slug.yaml" -- -c frontier_amd $device_opts -n $n_ranks
+fi
diff --git a/.github/workflows/frontier_amd/build.sh b/.github/workflows/frontier_amd/build.sh
new file mode 100644
index 0000000000..b8b1f7051c
--- /dev/null
+++ b/.github/workflows/frontier_amd/build.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+job_device=$1
+job_interface=$2
+run_bench=$3
+build_opts=""
+if [ "$job_device" = "gpu" ]; then
+  build_opts+="--gpu"
+  if [ "$job_interface" = "acc" ]; then
+      build_opts+=" acc"
+  elif [ "$job_interface" = "omp" ]; then
+      build_opts+=" mp"
+  fi
+fi
+
+. ./mfc.sh load -c famd -m g
+
+if [ "$run_bench" == "bench" ]; then
+    for dir in benchmarks/*/; do
+        dirname=$(basename "$dir")
+        ./mfc.sh run "$dir/case.py" --case-optimization -j 8 --dry-run $build_opts
+    done
+else
+    ./mfc.sh test -a --dry-run -j 8 $build_opts
+fi
+
diff --git a/.github/workflows/frontier_amd/submit-bench.sh b/.github/workflows/frontier_amd/submit-bench.sh
new file mode 100644
index 0000000000..5445c75aac
--- /dev/null
+++ b/.github/workflows/frontier_amd/submit-bench.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+set -e
+
+usage() {
+    echo "Usage: $0 [script.sh] [cpu|gpu]"
+}
+
+if [ ! -z "$1" ]; then
+    sbatch_script_contents=`cat $1`
+else
+    usage
+    exit 1
+fi
+
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="\
+#SBATCH -n 32                       # Number of cores required"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="\
+#SBATCH -n 8                       # Number of cores required"
+else
+    usage; exit 1
+fi
+
+
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
+
+sbatch <<EOT
+#!/bin/bash
+#SBATCH -JMFC-$job_slug            # Job name
+#SBATCH -A CFD154                  # charge account
+#SBATCH -N 1                       # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 02:59:00                # Duration of the job (Ex: 15 mins)
+#SBATCH -o$job_slug.out            # Combined output and error messages file
+#SBATCH -p extended                # Extended partition for shorter queues
+
+set -e
+set -x
+
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in $(pwd):"
+
+job_slug="$job_slug"
+job_device="$2"
+job_interface="$3"
+
+. ./mfc.sh load -c famd -m g
+
+$sbatch_script_contents
+
+EOT
+
diff --git a/.github/workflows/frontier_amd/submit.sh b/.github/workflows/frontier_amd/submit.sh
new file mode 100644
index 0000000000..5c2cb27afa
--- /dev/null
+++ b/.github/workflows/frontier_amd/submit.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+set -e
+
+usage() {
+    echo "Usage: $0 [script.sh] [cpu|gpu]"
+}
+
+if [ ! -z "$1" ]; then
+    sbatch_script_contents=`cat $1`
+else
+    usage
+    exit 1
+fi
+
+if [ "$2" = "cpu" ]; then
+    sbatch_device_opts="\
+#SBATCH -n 32                       # Number of cores required"
+elif [ "$2" = "gpu" ]; then
+    sbatch_device_opts="\
+#SBATCH -n 8                       # Number of cores required"
+else
+    usage
+    exit 1
+fi
+
+
+job_slug="`basename "$1" | sed 's/\.sh$//' | sed 's/[^a-zA-Z0-9]/-/g'`-$2-$3"
+
+sbatch <<EOT
+#!/bin/bash
+#SBATCH -J MFC-$job_slug            # Job name
+#SBATCH -A CFD154                  # charge account
+#SBATCH -N 1                       # Number of nodes required
+$sbatch_device_opts
+#SBATCH -t 05:59:00                # Duration of the job (Ex: 15 mins)
+#SBATCH -o$job_slug.out            # Combined output and error messages file
+#SBATCH -p extended                # Extended partition for shorter queues
+#SBATCH -W                         # Do not exit until the submitted job terminates.
+
+set -e
+set -x
+
+cd "\$SLURM_SUBMIT_DIR"
+echo "Running in $(pwd):"
+
+job_slug="$job_slug"
+job_device="$2"
+job_interface="$3"
+
+. ./mfc.sh load -c famd -m $([ "$2" = "gpu" ] && echo "g" || echo "c")
+
+$sbatch_script_contents
+
+EOT
+
diff --git a/.github/workflows/frontier_amd/test.sh b/.github/workflows/frontier_amd/test.sh
new file mode 100644
index 0000000000..10d63fb767
--- /dev/null
+++ b/.github/workflows/frontier_amd/test.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+gpus=`rocm-smi --showid | awk '{print $1}' | grep -Eo '[0-9]+' | uniq | tr '\n' ' '`
+ngpus=`echo "$gpus" | tr -d '[:space:]' | wc -c`
+
+device_opts=""
+if [ "$job_device" = "gpu" ]; then
+    device_opts+="--gpu"
+    if [ "$job_interface" = "acc" ]; then
+        device_opts+=" acc"
+    elif [ "$job_interface" = "omp" ]; then
+        device_opts+=" mp"
+    fi
+fi
+
+if [ "$job_device" = "gpu" ]; then
+    ./mfc.sh test -a  --max-attempts 3 -j $ngpus $device_opts -- -c frontier_amd
+else
+    ./mfc.sh test -a --max-attempts 3 -j 32 --no-gpu -- -c frontier_amd
+fi
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 251b276ace..b7a6a6b9e9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -127,6 +127,10 @@ jobs:
             cluster_name: 'Oak Ridge | Frontier'
             device: 'cpu'
             interface: 'none'
+          - lbl: 'frontier_amd'
+            cluster_name: 'Oak Ridge | Frontier (AMD)'
+            device: 'gpu'
+            interface: 'omp'
     runs-on:
       group:  phoenix
       labels: ${{ matrix.lbl }}
@@ -150,6 +154,14 @@ jobs:
         if:   matrix.lbl == 'frontier'
         run:  bash .github/workflows/frontier/submit.sh .github/workflows/frontier/test.sh ${{matrix.device}} ${{ matrix.interface }}
 
+      - name: Build
+        if:   matrix.lbl == 'frontier_amd'
+        run:  bash .github/workflows/frontier_amd/build.sh ${{ matrix.device }} ${{ matrix.interface }}
+
+      - name: Test
+        if:   matrix.lbl == 'frontier_amd'
+        run:  bash .github/workflows/frontier_amd/submit.sh .github/workflows/frontier_amd/test.sh ${{matrix.device}} ${{ matrix.interface }}  
+
       - name: Print Logs
         if:   always()
         run:  cat test-${{ matrix.device }}-${{ matrix.interface }}.out
@@ -160,3 +172,10 @@ jobs:
         with:
           name: logs-${{ strategy.job-index }}-${{ matrix.device }}-${{ matrix.interface }}
           path: test-${{ matrix.device }}-${{ matrix.interface }}.out
+
+      - name: Archive Logs
+        uses: actions/upload-artifact@v4
+        if:   matrix.lbl == 'frontier_amd'
+        with:
+          name: logs-${{ strategy.job-index }}-${{ matrix.device }}-${{ matrix.interface }}
+          path: test-${{ matrix.device }}-${{ matrix.interface }}.out
diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh
index bc83d503a6..5cfea6867d 100644
--- a/toolchain/bootstrap/modules.sh
+++ b/toolchain/bootstrap/modules.sh
@@ -8,7 +8,7 @@ show_help() {
   echo "Options:"
   echo "  -h, --help                  Display this help message and exit."
   echo "  -c, --computer COMPUTER     Configures for COMPUTER environment."
-  echo "                 Options:     Ascent (a) | Frontier (f) | Summit (s) | Wombat (w)"
+  echo "                 Options:     Ascent (a) | Frontier (f) | Frontier_amd (famd) | Summit (s) | Wombat (w)"
   echo "                              Bridges2 (b) | Expanse (e) | Delta (d) | DeltaAI (dai)"
   echo "                              Phoenix (p) | Richardson (r) | Oscar (o)"
   echo "                              Carpenter Cray (cc) | Carpenter GNU (c) |  Nautilus (n)"
@@ -38,7 +38,7 @@ done
 # Get computer (if not supplied in command line)
 if [ -v $u_c ]; then
     log   "Select a system:"
-    log   "$G""ORNL$W:    Ascent     (a) | Frontier (f) | Summit (s) | Wombat (w)"
+    log   "$G""ORNL$W:    Ascent     (a) | Frontier (f) | Frontier_amd (famd) | Summit (s) | Wombat (w)"
     log   "$B""LLNL $W:   Tuolumne   (tuo)"
     log   "$C""ACCESS$W:  Bridges2   (b) | Expanse (e) | Delta  (d) | DeltaAI (dai)"
     log   "$Y""Gatech$W:  Phoenix    (p)"
@@ -118,11 +118,31 @@ if [ $(echo "$VARIABLES" | grep = | wc -c) -gt 0 ]; then
 fi
 
 # Don't check for Cray paths on Carpenter, otherwise do check if they exist
-if [ ! -z ${CRAY_LD_LIBRARY_PATH+x} ] && [ "$u_c" '!=' 'c' ]; then
+if [ ! -z ${CRAY_LD_LIBRARY_PATH+x} ] && [ "$u_c" '!=' 'c' ] &&  [ "$u_c" '!=' 'famd' ] ; then
     ok "Found $M\$CRAY_LD_LIBRARY_PATH$CR. Prepending to $M\$LD_LIBRARY_PATH$CR."
     export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
 fi
 
+if[ "$u_c" '==' 'famd' ]; then 
+    export OLCF_AFAR_ROOT="/sw/crusher/ums/compilers/afar/rocm-afar-8873-drop-22.2.0"
+
+    export PATH=${OLCF_AFAR_ROOT}/lib/llvm/bin:${PATH}
+    export LD_LIBRARY_PATH=${OLCF_AFAR_ROOT}/lib:${OLCF_AFAR_ROOT}/lib/llvm/lib:${LD_LIBRARY_PATH}
+
+    export CRAY_MPICH_INC="-I${OLCF_AFAR_ROOT}/include/mpich3.4a2"
+    export CRAY_HIPFORT_INC="-I${OLCF_AFAR_ROOT}/include/hipfort/amdgcn"
+    export CRAY_HIPFORT_LIB="-L${OLCF_AFAR_ROOT}/lib -lhipfft"
+    export CRAY_HIP_INC="-I${OLCF_AFAR_ROOT}/include/hip"
+    export CRAY_MPICH_LIB="-L${CRAY_MPICH_PREFIX}/lib \
+                        ${CRAY_PMI_POST_LINK_OPTS} \
+                        -lmpifort_amd -lmpi_amd -lmpi -lpmi -lpmi2"
+    export LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${CRAY_LD_LIBRARY_PATH}" 
+    export CMAKE_PREFIX_PATH="${OLCF_AFAR_ROOT}:${CMAKE_PREFIX_PATH}"
+    export FC="${OLCF_AFAR_ROOT}/bin/amdflang"
+
+    unset MPICH_GPU_SUPPORT_ENABLED
+fi
+
 ok 'All modules and environment variables have been loaded.'
 
 return
diff --git a/toolchain/modules b/toolchain/modules
index c8ed12940b..3b53fd9c3d 100644
--- a/toolchain/modules
+++ b/toolchain/modules
@@ -45,6 +45,11 @@ f-all cpe/25.03 rocm/6.3.1
 f-all cray-fftw cray-hdf5 python cmake
 f-gpu python craype-accel-amd-gfx90a rocprofiler-compute/3.0.0
 
+famd    OLCF Frontier AMD 
+famd-all python cmake
+famd-all cpe/25.09
+famd-all PrgEnv-amd
+
 tuo     OLCF Tuolumne
 tuo-all cpe/25.03 rocm/6.3.1
 tuo-all cray-fftw/3.3.10.9 cray-hdf5 python/3.12.2 cmake
diff --git a/toolchain/templates/frontier_amd.mako b/toolchain/templates/frontier_amd.mako
new file mode 100644
index 0000000000..39064b8c64
--- /dev/null
+++ b/toolchain/templates/frontier_amd.mako
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+
+<%namespace name="helpers" file="helpers.mako"/>
+
+% if engine == 'batch':
+#SBATCH --nodes=${nodes}
+#SBATCH --ntasks-per-node=${tasks_per_node}
+#SBATCH --job-name="${name}"
+#SBATCH --output="${name}.out"
+#SBATCH --time=${walltime}
+#SBATCH --cpus-per-task=7
+#SBATCH -C nvme
+% if gpu_enabled:
+#SBATCH --gpus-per-task=1
+#SBATCH --gpu-bind=closest
+% endif
+% if account:
+#SBATCH --account=${account}
+% endif
+% if partition:
+#SBATCH --partition=${partition}
+% endif
+% if quality_of_service:
+#SBATCH --qos=${quality_of_service}
+% endif
+% if email:
+#SBATCH --mail-user=${email}
+#SBATCH --mail-type="BEGIN, END, FAIL"
+% endif
+% endif
+
+${helpers.template_prologue()}
+
+ok ":) Loading modules:\n"
+cd "${MFC_ROOT_DIR}"
+% if engine == 'batch':
+. ./mfc.sh load -c famd -m ${'g' if gpu_enabled else 'c'}
+% endif
+cd - > /dev/null
+echo
+
+ulimit -s unlimited
+
+% for target in targets:
+    ${helpers.run_prologue(target)}
+
+    % if engine == 'batch':
+        # Broadcast binary to compute nodes
+        sbcast --send-libs -pf ${target.get_install_binpath(case)} /mnt/bb/$USER/${target.name}
+    % endif
+
+    % if not mpi:
+        (set -x; ${profiler} "${target.get_install_binpath(case)}")
+    % else:
+        (set -x; srun --unbuffered \
+        % if engine == 'interactive':
+                --unbuffered --nodes ${nodes} --ntasks-per-node ${tasks_per_node} \
+                --cpus-per-task 7                                    \
+            % if gpu_enabled:
+                --gpus-per-task 1 --gpu-bind closest                 \
+            % endif
+            ${profiler} "${target.get_install_binpath(case)}")
+        % else:
+            ${profiler} "/mnt/bb/$USER/${target.name}")
+        % endif
+    % endif
+
+    ${helpers.run_epilogue(target)}
+
+    echo
+% endfor
+
+${helpers.template_epilogue()}

From 3c3d7f1b0836ede80a9dadcecfb41d59a180555e Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 01:57:48 -0500
Subject: [PATCH 11/21] cleanup cmake

---
 CMakeLists.txt | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 66d7326c3d..46f1e17dc8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,9 +185,11 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
         add_link_options("SHELL: -K trap=fp" "SHELL: -G2")
     endif()
 
-elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang")
     add_compile_options(
-	    #$<$<COMPILE_LANGUAGE:Fortran>:-Wall>
+        $<$<COMPILE_LANGUAGE:Fortran>:-Mfreeform>
+        $<$<COMPILE_LANGUAGE:Fortran>:-Mpreprocess>
+        $<$<COMPILE_LANGUAGE:Fortran>:-fdefault-real-8>
     )
 elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
     add_compile_options($<$<COMPILE_LANGUAGE:Fortran>:-free>)
@@ -381,7 +383,7 @@ macro(HANDLE_SOURCES target useCommon useOpenACC useOpenMP)
                                  -D MFC_${${target}_UPPER}
                                  -D MFC_COMPILER="${CMAKE_Fortran_COMPILER_ID}"
                                  -D MFC_GPU_MODE="${MFC_GPU_MODE}"
-				 -D MFC_CASE_OPTIMIZATION=False
+				                 -D MFC_CASE_OPTIMIZATION=False
                                  -D chemistry=False
                                  --line-numbering
                                  --no-folding
@@ -439,9 +441,9 @@ function(MFC_SETUP_TARGET)
 
     foreach (a_target ${IPO_TARGETS})
         set_target_properties(${a_target} PROPERTIES Fortran_PREPROCESS ON)
-	message(STATUS ${CMAKE_Fortran_COMPILER_ID})
+	   message(STATUS ${CMAKE_Fortran_COMPILER_ID})
 
-	target_include_directories(${a_target} PRIVATE
+       target_include_directories(${a_target} PRIVATE
             "${CMAKE_SOURCE_DIR}/src/common"
             "${CMAKE_SOURCE_DIR}/src/common/include"
             "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}")
@@ -484,12 +486,14 @@ function(MFC_SETUP_TARGET)
                 if (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC" OR CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")
                     find_package(CUDAToolkit REQUIRED)
                     target_link_libraries(${a_target} PRIVATE CUDA::cudart CUDA::cufft)
-                else()
-			#find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
-		    target_link_libraries(${a_target} PRIVATE $ENV{CRAY_HIPFORT_LIB})
-		endif()
+                elseif(CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
+                    target_link_libraries(${a_target} PRIVATE $ENV{CRAY_HIPFORT_LIB})
+                else() 
+                    find_package(hipfort COMPONENTS hipfft CONFIG REQUIRED)
+                    target_link_libraries(${a_target} PRIVATE hipfort::hipfft)                    
+                endif()
             else()
-		find_package(FFTW REQUIRED)
+                find_package(FFTW REQUIRED)
                 target_link_libraries(${a_target} PRIVATE FFTW::FFTW)
             endif()
         endif()

From 8a60c3065eb1f316eaf48c7837d97d6ca01c4061 Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 02:04:04 -0500
Subject: [PATCH 12/21] format

---
 CMakeLists.txt | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 46f1e17dc8..c739106134 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -441,9 +441,9 @@ function(MFC_SETUP_TARGET)
 
     foreach (a_target ${IPO_TARGETS})
         set_target_properties(${a_target} PROPERTIES Fortran_PREPROCESS ON)
-	   message(STATUS ${CMAKE_Fortran_COMPILER_ID})
+	    message(STATUS ${CMAKE_Fortran_COMPILER_ID})
 
-       target_include_directories(${a_target} PRIVATE
+        target_include_directories(${a_target} PRIVATE
             "${CMAKE_SOURCE_DIR}/src/common"
             "${CMAKE_SOURCE_DIR}/src/common/include"
             "${CMAKE_SOURCE_DIR}/src/${ARGS_TARGET}")
@@ -603,8 +603,8 @@ function(MFC_SETUP_TARGET)
                         PRIVATE -DFRONTIER_UNIFIED)
                 endif()
 
-		find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
-		target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn flang_rt.hostdevice)
+                find_package(hipfort COMPONENTS hip CONFIG REQUIRED)
+                target_link_libraries(${a_target} PRIVATE hipfort::hip hipfort::hipfort-amdgcn flang_rt.hostdevice)
             endif()
         elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Cray")
             target_compile_options(${a_target} PRIVATE "SHELL:-h noacc" "SHELL:-x acc")

From d4f1a40ac6506c09a89505364408aa18143f0818 Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 02:22:30 -0500
Subject: [PATCH 13/21] Merge with latest commit and fix typo

---
 src/simulation/m_ibm.fpp       | 7 +++++--
 toolchain/bootstrap/modules.sh | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 8c78db6770..68201aa315 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -1039,8 +1039,11 @@ contains
         real(wp), dimension(1:3, 1:3) :: viscous_stress_div, viscous_stress_div_1, viscous_stress_div_2, viscous_cross_1, viscous_cross_2 ! viscous stress tensor with temp vectors to hold divergence calculations
         real(wp), dimension(1:3) :: local_force_contribution, radial_vector, local_torque_contribution, vel
         real(wp) :: cell_volume, dx, dy, dz, dynamic_viscosity
-        real(wp), dimension(1:num_fluids) :: dynamic_viscosities
-
+#:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        real(wp), dimension(3) :: dynamic_viscosities
+#:else
+        real(wp), dimension(num_fluids) :: dynamic_viscosities
+#:endif
         forces = 0._wp
         torques = 0._wp
 
diff --git a/toolchain/bootstrap/modules.sh b/toolchain/bootstrap/modules.sh
index 5cfea6867d..2185309ada 100644
--- a/toolchain/bootstrap/modules.sh
+++ b/toolchain/bootstrap/modules.sh
@@ -123,7 +123,7 @@ if [ ! -z ${CRAY_LD_LIBRARY_PATH+x} ] && [ "$u_c" '!=' 'c' ] &&  [ "$u_c" '!=' '
     export LD_LIBRARY_PATH="$CRAY_LD_LIBRARY_PATH:$LD_LIBRARY_PATH"
 fi
 
-if[ "$u_c" '==' 'famd' ]; then 
+if [ "$u_c" '==' 'famd' ]; then 
     export OLCF_AFAR_ROOT="/sw/crusher/ums/compilers/afar/rocm-afar-8873-drop-22.2.0"
 
     export PATH=${OLCF_AFAR_ROOT}/lib/llvm/bin:${PATH}

From 37a9484045eef5277c8856350f8cc09c5745dbfd Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 02:26:23 -0500
Subject: [PATCH 14/21] format

---
 src/simulation/m_ibm.fpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/simulation/m_ibm.fpp b/src/simulation/m_ibm.fpp
index 68201aa315..5e5e136629 100644
--- a/src/simulation/m_ibm.fpp
+++ b/src/simulation/m_ibm.fpp
@@ -1039,11 +1039,11 @@ contains
         real(wp), dimension(1:3, 1:3) :: viscous_stress_div, viscous_stress_div_1, viscous_stress_div_2, viscous_cross_1, viscous_cross_2 ! viscous stress tensor with temp vectors to hold divergence calculations
         real(wp), dimension(1:3) :: local_force_contribution, radial_vector, local_torque_contribution, vel
         real(wp) :: cell_volume, dx, dy, dz, dynamic_viscosity
-#:if not MFC_CASE_OPTIMIZATION and USING_AMD
-        real(wp), dimension(3) :: dynamic_viscosities
-#:else
-        real(wp), dimension(num_fluids) :: dynamic_viscosities
-#:endif
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            real(wp), dimension(3) :: dynamic_viscosities
+        #:else
+            real(wp), dimension(num_fluids) :: dynamic_viscosities
+        #:endif
         forces = 0._wp
         torques = 0._wp
 

From 5457d1182022ffb77d7da6d262e272686b3df60d Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 04:20:46 -0500
Subject: [PATCH 15/21] Fix regression on EL case (phoenix)

---
 src/simulation/m_bubbles_EL.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/simulation/m_bubbles_EL.fpp b/src/simulation/m_bubbles_EL.fpp
index b3917c3488..c391e58cc6 100644
--- a/src/simulation/m_bubbles_EL.fpp
+++ b/src/simulation/m_bubbles_EL.fpp
@@ -577,7 +577,7 @@ contains
 
         ! Radial motion model
         adap_dt_stop_max = 0
-        $:GPU_PARALLEL_LOOP(private='[k,i,myalpha_rho,myalpha,myVapFlux,preterm1, term2, paux, pint, Romega, term1_fac,myR_m, mygamma_m, myPb, myMass_n, myMass_v,myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot,myPinf, aux1, aux2, myCson, myRho,gamma,pi_inf,qv,dmalf, dmntait, dmBtait, dm_bub_adv_src, dm_divu,adap_dt_stop]', &
+        $:GPU_PARALLEL_LOOP(private='[k,i,myalpha_rho,myalpha,Re,cell,myVapFlux,preterm1, term2, paux, pint, Romega, term1_fac,myR_m, mygamma_m, myPb, myMass_n, myMass_v,myR, myV, myBeta_c, myBeta_t, myR0, myPbdot, myMvdot,myPinf, aux1, aux2, myCson, myRho,gamma,pi_inf,qv,dmalf, dmntait, dmBtait, dm_bub_adv_src, dm_divu,adap_dt_stop]', &
             & reduction='[[adap_dt_stop_max]]',reductionOp='[MAX]', &
             & copy='[adap_dt_stop_max]',copyin='[stage]')
         do k = 1, nBubs

From 8ccb1943e5e57a864d697b33be08b11d6a81587e Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 05:59:49 -0500
Subject: [PATCH 16/21] Revert fypp

---
 CMakeLists.txt | 21 +++++----------------
 1 file changed, 5 insertions(+), 16 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c739106134..da1ec9adb9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -310,17 +310,7 @@ endif()
 # and generate documentation. Instead, we can simply include the list of .f90
 # files that will eventually be used to compile <target>.
 
-macro(HANDLE_SOURCES target useCommon useOpenACC useOpenMP)
-
-    if (${useOpenACC} AND ${useOpenMP})
-      message(FATAL_ERROR "OpenACC and OpenMP at same time is unsupported.")
-    elseif (${useOpenACC})
-      set(MFC_GPU_MODE "OpenACC")
-    elseif (${useOpenMP})
-      set(MFC_GPU_MODE "OpenMP")
-    else()
-      set(MFC_GPU_MODE "")
-    endif()
+macro(HANDLE_SOURCES target useCommon)
 
     set(${target}_DIR "${CMAKE_SOURCE_DIR}/src/${target}")
     set(common_DIR    "${CMAKE_SOURCE_DIR}/src/common")
@@ -382,7 +372,6 @@ macro(HANDLE_SOURCES target useCommon useOpenACC useOpenMP)
                                  -D MFC_${CMAKE_Fortran_COMPILER_ID}
                                  -D MFC_${${target}_UPPER}
                                  -D MFC_COMPILER="${CMAKE_Fortran_COMPILER_ID}"
-                                 -D MFC_GPU_MODE="${MFC_GPU_MODE}"
 				                 -D MFC_CASE_OPTIMIZATION=False
                                  -D chemistry=False
                                  --line-numbering
@@ -400,10 +389,10 @@ macro(HANDLE_SOURCES target useCommon useOpenACC useOpenMP)
 endmacro()
 
 
-HANDLE_SOURCES(pre_process ON OFF OFF)
-HANDLE_SOURCES(simulation ON ${MFC_OpenACC} ${MFC_OpenMP})
-HANDLE_SOURCES(post_process ON OFF OFF)
-HANDLE_SOURCES(syscheck OFF ${MFC_OpenACC} ${MFC_OpenMP})
+HANDLE_SOURCES(pre_process ON)
+HANDLE_SOURCES(simulation ON)
+HANDLE_SOURCES(post_process ON)
+HANDLE_SOURCES(syscheck OFF)
 
 # MFC_SETUP_TARGET: Given a target (herein <target>), this macro creates a new
 # executable <target> with the appropriate sources, compiler definitions, and

From a70b8cd1ac68fc6166b34db27f21b888466dd550 Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 06:18:14 -0500
Subject: [PATCH 17/21] add amd to file filter

---
 .github/file-filter.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/file-filter.yml b/.github/file-filter.yml
index 6c8fb7216b..a2910c89af 100644
--- a/.github/file-filter.yml
+++ b/.github/file-filter.yml
@@ -24,6 +24,7 @@ scripts: &scripts
 yml: &yml
   - '.github/workflows/phoenix/**'
   - '.github/workflows/frontier/**'
+  - '.github/workflows/frontier_amd/**'
   - '.github/workflows/bench.yml'
   - '.github/workflows/test.yml'
   - '.github/workflows/formatting.yml'

From 146c2a5e17277fc280034c78f92a46128db52dfb Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 09:46:42 -0500
Subject: [PATCH 18/21] Checkers for AMD compiler + fixing a typo

---
 .github/workflows/bench.yml      |  4 ++--
 .github/workflows/test.yml       | 23 +++++++++++++++--------
 src/common/m_boundary_common.fpp |  2 +-
 src/common/m_checker_common.fpp  | 14 ++++++++++++++
 src/simulation/m_start_up.fpp    |  5 +++++
 5 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
index 0653883140..1427f9d693 100644
--- a/.github/workflows/bench.yml
+++ b/.github/workflows/bench.yml
@@ -70,10 +70,10 @@ jobs:
             device: gpu
             interface: omp
             build_script: "bash .github/workflows/frontier/build.sh gpu omp bench"
-          - cluster: frontier
+          - cluster: frontier_amd
             name: Oak Ridge | Frontier (AMD)
             group: phoenix
-            labels: frontier_amd
+            labels: frontier
             flag: famd
             device: gpu
             interface: omp
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b7a6a6b9e9..bf0cd8e247 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -103,31 +103,38 @@ jobs:
         include:
           # Phoenix (GT)
           - lbl: 'gt'
+            lbl_comp: 'gt'
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'gpu'
             interface: 'acc'
           - lbl: 'gt'
+            lbl_comp: 'gt'
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'gpu'
             interface: 'omp'
           - lbl: 'gt'
+            lbl_comp: 'gt'
             cluster_name: 'Georgia Tech | Phoenix'
             device: 'cpu'
             interface: 'none'
           # Frontier (ORNL)
           - lbl: 'frontier'
+            lbl_comp: 'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'gpu'
             interface: 'acc'
           - lbl: 'frontier'
+            lbl_comp: 'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'gpu'
             interface: 'omp'
           - lbl: 'frontier'
+            lbl_comp: 'frontier'
             cluster_name: 'Oak Ridge | Frontier'
             device: 'cpu'
             interface: 'none'
-          - lbl: 'frontier_amd'
+          - lbl: 'frontier'
+            lbl_comp: 'frontier_amd'
             cluster_name: 'Oak Ridge | Frontier (AMD)'
             device: 'gpu'
             interface: 'omp'
@@ -143,23 +150,23 @@ jobs:
         uses: actions/checkout@v4
 
       - name: Build & Test
-        if:   matrix.lbl == 'gt'
+        if:   matrix.lbl_comp == 'gt'
         run:  bash .github/workflows/phoenix/submit.sh .github/workflows/phoenix/test.sh ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Build
-        if:   matrix.lbl == 'frontier'
+        if:   matrix.lbl_comp == 'frontier'
         run:  bash .github/workflows/frontier/build.sh ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Test
-        if:   matrix.lbl == 'frontier'
+        if:   matrix.lbl_comp == 'frontier'
         run:  bash .github/workflows/frontier/submit.sh .github/workflows/frontier/test.sh ${{matrix.device}} ${{ matrix.interface }}
 
       - name: Build
-        if:   matrix.lbl == 'frontier_amd'
+        if:   matrix.lbl_comp == 'frontier_amd'
         run:  bash .github/workflows/frontier_amd/build.sh ${{ matrix.device }} ${{ matrix.interface }}
 
       - name: Test
-        if:   matrix.lbl == 'frontier_amd'
+        if:   matrix.lbl_comp == 'frontier_amd'
         run:  bash .github/workflows/frontier_amd/submit.sh .github/workflows/frontier_amd/test.sh ${{matrix.device}} ${{ matrix.interface }}  
 
       - name: Print Logs
@@ -168,14 +175,14 @@ jobs:
 
       - name: Archive Logs
         uses: actions/upload-artifact@v4
-        if:   matrix.lbl == 'frontier'
+        if:   matrix.lbl_comp == 'frontier'
         with:
           name: logs-${{ strategy.job-index }}-${{ matrix.device }}-${{ matrix.interface }}
           path: test-${{ matrix.device }}-${{ matrix.interface }}.out
 
       - name: Archive Logs
         uses: actions/upload-artifact@v4
-        if:   matrix.lbl == 'frontier_amd'
+        if:   matrix.lbl_comp == 'frontier_amd'
         with:
           name: logs-${{ strategy.job-index }}-${{ matrix.device }}-${{ matrix.interface }}
           path: test-${{ matrix.device }}-${{ matrix.interface }}.out
diff --git a/src/common/m_boundary_common.fpp b/src/common/m_boundary_common.fpp
index a0c4ed2455..4b5f660bd6 100644
--- a/src/common/m_boundary_common.fpp
+++ b/src/common/m_boundary_common.fpp
@@ -1558,7 +1558,7 @@ contains
 
         end if
 
-        #:if not MFC_CASE_OPTIMIZATION or num_dims > 2
+        #:if not MFC_CASE_OPTIMIZATION or num_dims > 1
 
             if (n == 0) then
                 return
diff --git a/src/common/m_checker_common.fpp b/src/common/m_checker_common.fpp
index 4d5b7d80cf..495eada91f 100644
--- a/src/common/m_checker_common.fpp
+++ b/src/common/m_checker_common.fpp
@@ -2,6 +2,7 @@
 !!@file m_checker_common.f90
 !!@brief Contains module m_checker_common
 
+#:include 'case.fpp'
 #:include 'macros.fpp'
 
 !> @brief The purpose of the module is to check for compatible input files for.
@@ -29,6 +30,9 @@ contains
 #ifndef MFC_SIMULATION
         call s_check_total_cells
 #endif
+        #:if USING_AMD
+            call s_check_amd
+        #:endif
 
     end subroutine s_check_inputs_common
 
@@ -46,6 +50,16 @@ contains
             "which is currently "//trim(numStr))
     end subroutine s_check_total_cells
 
+    impure subroutine s_check_amd
+
+        #:if not MFC_CASE_OPTIMIZATION 
+            @:PROHIBIT(num_fluids > 3, "num_fluids <= 3 for AMDFLang when Case optimization is off")
+            @:PROHIBIT((bubbles_euler .or. bubbles_lagrange) .and. nb > 3, "nb <= 3 for AMDFLang when Case optimization is off")
+            @:PROHIBIT(chemistry .and. num_species /= 10, "num_species = 10 for AMDFLang when Case optimization is off")
+        #:endif
+
+    end subroutine s_check_amd
+
 #endif
 
 #ifndef MFC_POST_PROCESS
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index d46a3e0793..5e9da9c292 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1301,6 +1301,11 @@ contains
         real(wp) :: temp1, temp2, temp3, temp4
 
         call s_initialize_global_parameters_module()
+        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+            #:for BC in {-5, -6, -7, -8, -9, -10, -11, -12, -13}
+                @:PROHIBIT(any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == ${BC}$) .and. sys_size > 12, "CBC module with AMD compiler requires sys_size <= 12 when case optimization is turned off")
+            #:endfor
+        #:endif
         if (bubbles_euler .or. bubbles_lagrange) then
             call s_initialize_bubbles_model()
         end if

From bce4bf6c59d6ed7fafee6cffe21b119b87e884f6 Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 09:47:30 -0500
Subject: [PATCH 19/21] format

---
 src/common/m_checker_common.fpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/common/m_checker_common.fpp b/src/common/m_checker_common.fpp
index 495eada91f..d98b7592d0 100644
--- a/src/common/m_checker_common.fpp
+++ b/src/common/m_checker_common.fpp
@@ -52,7 +52,7 @@ contains
 
     impure subroutine s_check_amd
 
-        #:if not MFC_CASE_OPTIMIZATION 
+        #:if not MFC_CASE_OPTIMIZATION
             @:PROHIBIT(num_fluids > 3, "num_fluids <= 3 for AMDFLang when Case optimization is off")
             @:PROHIBIT((bubbles_euler .or. bubbles_lagrange) .and. nb > 3, "nb <= 3 for AMDFLang when Case optimization is off")
             @:PROHIBIT(chemistry .and. num_species /= 10, "num_species = 10 for AMDFLang when Case optimization is off")

From c4af75bc11e225f0feec7f2ead935b74584705f2 Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 10:05:50 -0500
Subject: [PATCH 20/21] fixing typos

---
 src/common/m_checker_common.fpp | 4 ++--
 src/simulation/m_start_up.fpp   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/common/m_checker_common.fpp b/src/common/m_checker_common.fpp
index d98b7592d0..ff4af7bd20 100644
--- a/src/common/m_checker_common.fpp
+++ b/src/common/m_checker_common.fpp
@@ -50,6 +50,8 @@ contains
             "which is currently "//trim(numStr))
     end subroutine s_check_total_cells
 
+#endif
+
     impure subroutine s_check_amd
 
         #:if not MFC_CASE_OPTIMIZATION
@@ -60,8 +62,6 @@ contains
 
     end subroutine s_check_amd
 
-#endif
-
 #ifndef MFC_POST_PROCESS
 
 #endif
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 5e9da9c292..3e690ec3da 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1301,7 +1301,7 @@ contains
         real(wp) :: temp1, temp2, temp3, temp4
 
         call s_initialize_global_parameters_module()
-        #:if not MFC_CASE_OPTIMIZATION and USING_AMD
+        #:if USING_AMD
             #:for BC in {-5, -6, -7, -8, -9, -10, -11, -12, -13}
                 @:PROHIBIT(any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == ${BC}$) .and. sys_size > 12, "CBC module with AMD compiler requires sys_size <= 12 when case optimization is turned off")
             #:endfor

From bd28799067f37d20c941075d5a8d1dcb2346da5c Mon Sep 17 00:00:00 2001
From: Anand <anand@Anands-MacBook-Pro.local>
Date: Thu, 29 Jan 2026 12:21:31 -0500
Subject: [PATCH 21/21] fix bounds in CBC and add cpu test suite for amdflang

---
 .github/workflows/test.yml               |  5 +++++
 src/post_process/m_global_parameters.fpp |  2 +-
 src/pre_process/m_global_parameters.fpp  |  2 +-
 src/simulation/m_cbc.fpp                 |  2 +-
 src/simulation/m_compute_cbc.fpp         | 24 ++++++++++++------------
 src/simulation/m_global_parameters.fpp   |  2 +-
 src/simulation/m_start_up.fpp            |  3 ++-
 7 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index bf0cd8e247..f0dc72783d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -138,6 +138,11 @@ jobs:
             cluster_name: 'Oak Ridge | Frontier (AMD)'
             device: 'gpu'
             interface: 'omp'
+          - lbl: 'frontier'
+            lbl_comp: 'frontier_amd'
+            cluster_name: 'Oak Ridge | Frontier (AMD)'
+            device: 'cpu'
+            interface: 'none'
     runs-on:
       group:  phoenix
       labels: ${{ matrix.lbl }}
diff --git a/src/post_process/m_global_parameters.fpp b/src/post_process/m_global_parameters.fpp
index e4585d011a..3d0ddcef6d 100644
--- a/src/post_process/m_global_parameters.fpp
+++ b/src/post_process/m_global_parameters.fpp
@@ -245,7 +245,7 @@ module m_global_parameters
     logical :: E_wrt
     logical, dimension(num_fluids_max) :: alpha_rho_e_wrt
     logical :: fft_wrt
-    logical :: dummy
+    logical :: dummy   !< AMDFlang workaround: keep a dummy logical to avoid a compiler case-optimization bug when a parameter+GPU-kernel conditional is false
     logical :: pres_wrt
     logical, dimension(num_fluids_max) :: alpha_wrt
     logical :: gamma_wrt
diff --git a/src/pre_process/m_global_parameters.fpp b/src/pre_process/m_global_parameters.fpp
index c867de7562..933bfb6f5b 100644
--- a/src/pre_process/m_global_parameters.fpp
+++ b/src/pre_process/m_global_parameters.fpp
@@ -300,7 +300,7 @@ module m_global_parameters
     !! to the next time-step.
 
     logical :: fft_wrt
-    logical :: dummy
+    logical :: dummy  !< AMDFlang workaround: keep a dummy logical to avoid a compiler case-optimization bug when a parameter+GPU-kernel conditional is false
 
 contains
 
diff --git a/src/simulation/m_cbc.fpp b/src/simulation/m_cbc.fpp
index 4dc7d936fb..012d2358f2 100644
--- a/src/simulation/m_cbc.fpp
+++ b/src/simulation/m_cbc.fpp
@@ -651,7 +651,7 @@ contains
         real(wp) :: dqv_dt
         real(wp) :: dpres_ds
         #:if USING_AMD
-            real(wp), dimension(12) :: L
+            real(wp), dimension(20) :: L
         #:else
             real(wp), dimension(sys_size) :: L
         #:endif
diff --git a/src/simulation/m_compute_cbc.fpp b/src/simulation/m_compute_cbc.fpp
index 8c6a1d9fca..13bd6a8209 100644
--- a/src/simulation/m_compute_cbc.fpp
+++ b/src/simulation/m_compute_cbc.fpp
@@ -36,7 +36,7 @@ contains
     subroutine s_fill_density_L(L, lambda_factor, lambda2, c, mf, dalpha_rho_ds, dpres_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -59,7 +59,7 @@ contains
     subroutine s_fill_velocity_L(L, lambda_factor, lambda2, dvel_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -81,7 +81,7 @@ contains
     subroutine s_fill_advection_L(L, lambda_factor, lambda2, dadv_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -103,7 +103,7 @@ contains
     subroutine s_fill_chemistry_L(L, lambda_factor, lambda2, dYs_ds)
         $:GPU_ROUTINE(parallelism='[seq]')
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -130,7 +130,7 @@ contains
 
         real(wp), dimension(3), intent(in) :: lambda
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -154,7 +154,7 @@ contains
 
         real(wp), dimension(3), intent(in) :: lambda
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -194,7 +194,7 @@ contains
 
         real(wp), dimension(3), intent(in) :: lambda
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -217,7 +217,7 @@ contains
 
         real(wp), dimension(3), intent(in) :: lambda
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -250,7 +250,7 @@ contains
 
         real(wp), dimension(3), intent(in) :: lambda
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -280,7 +280,7 @@ contains
 
         real(wp), dimension(3), intent(in) :: lambda
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -308,7 +308,7 @@ contains
         $:GPU_ROUTINE(function_name='s_compute_supersonic_inflow_L', &
             & parallelism='[seq]', cray_inline=True)
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
@@ -323,7 +323,7 @@ contains
 
         real(wp), dimension(3), intent(in) :: lambda
         #:if USING_AMD
-            real(wp), dimension(12), intent(inout) :: L
+            real(wp), dimension(20), intent(inout) :: L
         #:else
             real(wp), dimension(sys_size), intent(inout) :: L
         #:endif
diff --git a/src/simulation/m_global_parameters.fpp b/src/simulation/m_global_parameters.fpp
index 122133f844..1b1738a044 100644
--- a/src/simulation/m_global_parameters.fpp
+++ b/src/simulation/m_global_parameters.fpp
@@ -543,7 +543,7 @@ module m_global_parameters
     $:GPU_DECLARE(create='[Bx0,powell]')
 
     logical :: fft_wrt
-    logical :: dummy
+    logical :: dummy  !< AMDFlang workaround: keep a dummy logical to avoid a compiler case-optimization bug when a parameter+GPU-kernel conditional is false
 
     !> @name Continuum damage model parameters
     !> @{!
diff --git a/src/simulation/m_start_up.fpp b/src/simulation/m_start_up.fpp
index 3e690ec3da..b60ebcd65d 100644
--- a/src/simulation/m_start_up.fpp
+++ b/src/simulation/m_start_up.fpp
@@ -1303,7 +1303,8 @@ contains
         call s_initialize_global_parameters_module()
         #:if USING_AMD
             #:for BC in {-5, -6, -7, -8, -9, -10, -11, -12, -13}
-                @:PROHIBIT(any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == ${BC}$) .and. sys_size > 12, "CBC module with AMD compiler requires sys_size <= 12 when case optimization is turned off")
+                @:PROHIBIT(any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == ${BC}$) .and. adv_idx%end > 20 .and. (.not. chemistry), "CBC module with AMD compiler requires adv_idx%end <= 20 when case optimization is turned off")
+                @:PROHIBIT(any((/bc_x%beg, bc_x%end, bc_y%beg, bc_y%end, bc_z%beg, bc_z%end/) == ${BC}$) .and. sys_size > 20 .and. (chemistry), "CBC module with AMD compiler and chemistry requires sys_size <= 20 when case optimization is turned off")
             #:endfor
         #:endif
         if (bubbles_euler .or. bubbles_lagrange) then