diff --git a/CMakeLists.txt b/CMakeLists.txt index 5622d94105..5e71f601cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,6 +23,7 @@ option(MFC_OpenMP "Build with OpenMP" OFF option(MFC_GCov "Build with GCov" OFF) option(MFC_Unified "Build with unified CPU & GPU memory (GH-200 only)" OFF) option(MFC_Fastmath "Build with -gpu=fastmath on NV GPUs" OFF) +option(MFC_TRACE_CALLS "Build with generated runtime call tracing" OFF) option(MFC_PRE_PROCESS "Build pre_process" OFF) option(MFC_SIMULATION "Build simulation" OFF) option(MFC_POST_PROCESS "Build post_process" OFF) @@ -108,6 +109,17 @@ endif() find_program(FYPP_EXE fypp REQUIRED) +set(MFC_TRACE_SUPPORTED_COMPILERS GNU Intel IntelLLVM NVHPC PGI) + +if (MFC_TRACE_CALLS) + if (NOT CMAKE_Fortran_COMPILER_ID IN_LIST MFC_TRACE_SUPPORTED_COMPILERS) + message(FATAL_ERROR "ERROR: Runtime call tracing is supported with GNU, Intel/IntelLLVM, and NVHPC/PGI Fortran compilers. CMake detected ${CMAKE_Fortran_COMPILER_ID}.") + endif() + + find_package(Python3 REQUIRED COMPONENTS Interpreter) + find_package(Threads REQUIRED) +endif() + # Miscellaneous Configuration: # * Explicitly link to -ldl (or system equivalent) @@ -133,6 +145,16 @@ if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") $<$:-ffree-line-length-none> ) + if (MFC_TRACE_CALLS) + add_compile_options( + $<$:-finstrument-functions> + $<$:-g> + $<$:-fno-inline> + $<$:-fno-optimize-sibling-calls> + ) + add_link_options("-rdynamic") + endif() + if (MFC_GCov) # Warning present in gcc versions >= 12 that is treated as an error @@ -241,7 +263,7 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Flang") if (CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelDebug") add_compile_options($<$:-O1> $<$:-g>) endif() -elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") +elseif ((CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") OR (CMAKE_Fortran_COMPILER_ID STREQUAL "IntelLLVM")) add_compile_options($<$:-free>) if (CMAKE_BUILD_TYPE STREQUAL "Debug") @@ -249,6 +271,15 @@ elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel") elseif (CMAKE_BUILD_TYPE STREQUAL "RelDebug") add_compile_options(-g -Og -traceback -check bounds) endif() + + if (MFC_TRACE_CALLS) + add_compile_options( + $<$:-finstrument-functions> + $<$:-g> + $<$:-O0> + ) + add_link_options("-Wl,-export-dynamic") + endif() elseif ((CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") OR (CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")) add_compile_options( $<$:-Mfreeform> @@ -275,12 +306,28 @@ elseif ((CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") OR (CMAKE_Fortran_COMPILER_ ) endif() + if (MFC_TRACE_CALLS) + add_compile_options( + $<$:-Minstrument=functions> + $<$:-g> + ) + add_link_options("-Wl,-export-dynamic") + endif() + if (DEFINED ENV{MFC_CUDA_CC}) string(REGEX MATCHALL "[0-9]+" MFC_CUDA_CC $ENV{MFC_CUDA_CC}) message(STATUS "Found $MFC_CUDA_CC specified. GPU code will be generated for compute capability(ies) ${MFC_CUDA_CC}.") endif() endif() +if (MFC_TRACE_CALLS) + if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") + set_source_files_properties("${CMAKE_SOURCE_DIR}/src/common/m_trace.f90" PROPERTIES COMPILE_OPTIONS "-fno-instrument-functions") + elseif ((CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") OR (CMAKE_Fortran_COMPILER_ID STREQUAL "PGI")) + set_source_files_properties("${CMAKE_SOURCE_DIR}/src/common/m_trace.f90" PROPERTIES COMPILE_OPTIONS "-Minstrument-exclude-func-list=s_trace_point_begin;-Minstrument-exclude-func-list=s_trace_point_end") + endif() +endif() + if (CMAKE_BUILD_TYPE STREQUAL "Release") # Processor tuning: Check if we can target the host's native CPU's ISA. # Skip for gcov builds — -march=native on newer CPUs (e.g. Granite Rapids) @@ -306,7 +353,9 @@ if (CMAKE_BUILD_TYPE STREQUAL "Release") # Enable LTO/IPO if supported (skip for gcov — LTO interferes with coverage # instrumentation and can trigger assembler errors on newer architectures). - if (MFC_GCov) + if (MFC_TRACE_CALLS) + message(STATUS "LTO/IPO disabled for trace build") + elseif (MFC_GCov) message(STATUS "LTO/IPO disabled for gcov build") elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "NVHPC") if (MFC_Unified) @@ -396,6 +445,7 @@ macro(HANDLE_SOURCES target useCommon) set(${target}_SRCs ${${target}_F90s}) if (${useCommon}) file(GLOB common_F90s CONFIGURE_DEPENDS "${common_DIR}/*.f90") + list(FILTER common_F90s EXCLUDE REGEX ".*/m_trace\.f90$") list(APPEND ${target}_SRCs ${common_F90s}) endif() @@ -458,6 +508,48 @@ macro(HANDLE_SOURCES target useCommon) list(APPEND ${target}_SRCs ${f90}) endforeach() + + if (MFC_TRACE_CALLS) + list(FIND ${target}_SRCs "${common_DIR}/m_trace.f90" _trace_module_index) + if (_trace_module_index EQUAL -1) + list(APPEND ${target}_SRCs "${common_DIR}/m_trace.f90") + endif() + list(APPEND ${target}_SRCs "${common_DIR}/m_trace_runtime.c") + + set(_trace_SRCs "") + set(_trace_instrument_options "") + if (MFC_OpenACC OR MFC_OpenMP) + list(APPEND _trace_instrument_options "--no-point-traces") + endif() + file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/trace/${target}") + + foreach(src ${${target}_SRCs}) + cmake_path(GET src FILENAME src_filename) + + if(src_filename STREQUAL "m_trace.f90" OR src_filename STREQUAL "m_trace_runtime.c" OR src_filename STREQUAL "m_thermochem.f90") + list(APPEND _trace_SRCs "${src}") + else() + set(trace_src "${CMAKE_BINARY_DIR}/trace/${target}/${src_filename}") + + add_custom_command( + OUTPUT "${trace_src}" + COMMAND "${Python3_EXECUTABLE}" + "${CMAKE_SOURCE_DIR}/toolchain/instrument_fortran_trace.py" + ${_trace_instrument_options} + "${src}" + "${trace_src}" + DEPENDS "${src}" + "${CMAKE_SOURCE_DIR}/toolchain/instrument_fortran_trace.py" + COMMENT "Instrumenting (trace) ${src_filename}" + VERBATIM + ) + + list(APPEND _trace_SRCs "${trace_src}") + endif() + endforeach() + + set(${target}_SRCs ${_trace_SRCs}) + endif() endmacro() @@ -559,6 +651,10 @@ exit 0 MFC_${${ARGS_TARGET}_UPPER} ) + if (MFC_TRACE_CALLS) + target_link_libraries(${a_target} PRIVATE Threads::Threads) + endif() + if (MFC_MPI AND ARGS_MPI) find_package(MPI COMPONENTS Fortran REQUIRED) @@ -1039,4 +1135,4 @@ site_name(SITE_NAME) configure_file( "${CMAKE_CURRENT_SOURCE_DIR}/toolchain/cmake/configuration.cmake.in" - "${CMAKE_CURRENT_BINARY_DIR}/configuration.txt") \ No newline at end of file + "${CMAKE_CURRENT_BINARY_DIR}/configuration.txt") diff --git a/docs/module_categories.json b/docs/module_categories.json index ef4a7d726e..69c3fbf4fd 100644 --- a/docs/module_categories.json +++ b/docs/module_categories.json @@ -71,7 +71,8 @@ "m_checker", "m_checker_common", "m_sim_helpers", - "m_derived_variables" + "m_derived_variables", + "m_trace" ] }, { diff --git a/src/common/m_trace.f90 b/src/common/m_trace.f90 new file mode 100644 index 0000000000..4777513638 --- /dev/null +++ b/src/common/m_trace.f90 @@ -0,0 +1,122 @@ +!> +!! @file +!! @brief Contains module m_trace + +!> @brief Simple runtime call tracing helpers. +module m_trace + + implicit none + + private + public :: s_trace_point_begin, s_trace_point_end + + logical, private :: trace_initialized = .false. + logical, private :: trace_enabled = .false. + logical, private :: trace_point_enabled = .false. + logical, private :: trace_point_middle = .false. + integer, private :: trace_j = 0 + integer, private :: trace_k = 0 + integer, private :: trace_l = 0 + integer, private :: trace_point_depth = 0 + + interface + subroutine c_trace_point_begin() bind(C, name="mfc_trace_point_begin") + + end subroutine c_trace_point_begin + + subroutine c_trace_point_end() bind(C, name="mfc_trace_point_end") + + end subroutine c_trace_point_end + end interface + +contains + + !> Initialize trace settings from environment variables. + subroutine s_initialize_trace + + character(len=64) :: env_value + integer :: env_len + integer :: first_comma + integer :: second_comma + integer :: read_status + + if (trace_initialized) return + + call get_environment_variable('MFC_TRACE', env_value, length=env_len) + trace_enabled = env_len > 0 .and. trim(env_value) /= '0' + + call get_environment_variable('MFC_TRACE_POINT', env_value, length=env_len) + if (env_len > 0) then + if (trim(env_value(:env_len)) == 'middle') then + trace_point_enabled = .true. + trace_point_middle = .true. + else + first_comma = index(env_value, ',') + second_comma = 0 + if (first_comma > 0) second_comma = index(env_value(first_comma + 1:), ',') + if (second_comma > 0) second_comma = second_comma + first_comma + + if (first_comma > 0 .and. second_comma > first_comma) then + read (env_value(:first_comma - 1), *, iostat=read_status) trace_j + trace_point_enabled = read_status == 0 + read (env_value(first_comma + 1:second_comma - 1), *, iostat=read_status) trace_k + trace_point_enabled = trace_point_enabled .and. read_status == 0 + read (env_value(second_comma + 1:env_len), *, iostat=read_status) trace_l + trace_point_enabled = trace_point_enabled .and. read_status == 0 + end if + end if + end if + + trace_initialized = .true. + + end subroutine s_initialize_trace + + !> Enable nested routine-entry tracing while a call at MFC_TRACE_POINT executes. + subroutine s_trace_point_begin(j, k, l, mid_j, mid_k, mid_l) + + integer, intent(in) :: j + integer, intent(in) :: k + integer, intent(in) :: l + integer, intent(in) :: mid_j + integer, intent(in) :: mid_k + integer, intent(in) :: mid_l + integer :: target_j + integer :: target_k + integer :: target_l + + call s_initialize_trace() + + if (.not. trace_enabled) return + if (.not. trace_point_enabled) return + if (trace_point_middle) then + target_j = mid_j + target_k = mid_k + target_l = mid_l + else + target_j = trace_j + target_k = trace_k + target_l = trace_l + end if + + if (j /= target_j .or. k /= target_k .or. l /= target_l) return + + trace_point_depth = trace_point_depth + 1 + call c_trace_point_begin() + + end subroutine s_trace_point_begin + + !> Disable nested point tracing after a grid-point call returns. + subroutine s_trace_point_end + + call s_initialize_trace() + + if (.not. trace_enabled) return + if (.not. trace_point_enabled) return + if (trace_point_depth > 0) then + call c_trace_point_end() + trace_point_depth = trace_point_depth - 1 + end if + + end subroutine s_trace_point_end + +end module m_trace diff --git a/src/common/m_trace_runtime.c b/src/common/m_trace_runtime.c new file mode 100644 index 0000000000..8a7d0a8670 --- /dev/null +++ b/src/common/m_trace_runtime.c @@ -0,0 +1,712 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MFC_TRACE_STACK_MAX 4096 +#define MFC_TRACE_NAME_MAX 512 +#define MFC_TRACE_SYMBOL_CACHE_MAX 512 +#define MFC_TRACE_SETUP_MAX 8192 +#define MFC_TRACE_CONTEXT_MAX 16384 +#define MFC_TRACE_COLOR_RESET "\033[0m" +#define MFC_TRACE_COLOR_BOLD "\033[1m" +#define MFC_TRACE_COLOR_DIM "\033[2m" +#define MFC_TRACE_COLOR_TRACE "\033[32m" +#define MFC_TRACE_COLOR_CONTEXT "\033[36m" +#define MFC_TRACE_COLOR_MPI "\033[35m" + +typedef struct { + void *addr; + char name[MFC_TRACE_NAME_MAX]; +} mfc_trace_symbol_cache_entry; + +typedef struct { + void *addr; + void *call_site; + uint64_t self_id; + uint64_t parent_id; + int depth; + int emitted; +} mfc_trace_setup_entry; + +typedef struct { + void *addr; + void *call_site; +} mfc_trace_context_entry; + +static __thread void *trace_stack[MFC_TRACE_STACK_MAX]; +static __thread void *trace_stack_call_sites[MFC_TRACE_STACK_MAX]; +static __thread uint64_t trace_stack_ids[MFC_TRACE_STACK_MAX]; +static __thread uint64_t trace_last_dump_ids[MFC_TRACE_STACK_MAX]; +static __thread mfc_trace_symbol_cache_entry trace_symbol_cache[MFC_TRACE_SYMBOL_CACHE_MAX]; +static __thread mfc_trace_setup_entry trace_setup[MFC_TRACE_SETUP_MAX]; +static __thread mfc_trace_context_entry trace_context[MFC_TRACE_CONTEXT_MAX]; +static __thread mfc_trace_context_entry trace_mpi_context[MFC_TRACE_CONTEXT_MAX]; +static __thread int trace_stack_depth = 0; +static __thread int trace_last_dump_depth = -1; +static __thread int trace_point_depth = 0; +static __thread int trace_in_hook = 0; +static __thread int trace_symbol_cache_count = 0; +static __thread int trace_setup_count = 0; +static __thread int trace_context_count = 0; +static __thread int trace_mpi_context_count = 0; +static __thread uint64_t trace_next_stack_id = 1; +static pthread_once_t trace_initialize_once = PTHREAD_ONCE_INIT; +static int trace_enabled = 0; +static int trace_point_enabled = 0; +static int trace_include_setup = 0; +static int trace_context_enabled = 0; +static int trace_mpi_enabled = 0; +static int trace_tree_format = 0; +static int trace_color_enabled = 0; +static int trace_stdout_enabled = 0; +static int trace_file_fd = -1; + +static void mfc_trace_initialize_once(void) __attribute__((no_instrument_function)); +static void mfc_trace_initialize(void) __attribute__((no_instrument_function)); +static int mfc_trace_process_rank(void) __attribute__((no_instrument_function)); +static void mfc_trace_executable_name(char *buffer, size_t buffer_size) __attribute__((no_instrument_function)); +static void mfc_trace_format_function_name(const char *name, char *buffer, size_t buffer_size) __attribute__((no_instrument_function)); +static void mfc_trace_pretty_file(const char *file, char *buffer, size_t buffer_size) __attribute__((no_instrument_function)); +static void mfc_trace_shell_quote(const char *text, char *buffer, size_t buffer_size) __attribute__((no_instrument_function)); +static int mfc_trace_symbol_from_addr2line(void *addr, const char *image, char *buffer, size_t buffer_size) __attribute__((no_instrument_function)); +static int mfc_trace_call_site_from_addr2line(void *addr, char *buffer, size_t buffer_size) __attribute__((no_instrument_function)); +static void mfc_trace_symbol(void *addr, char *buffer, size_t buffer_size) __attribute__((no_instrument_function)); +static int mfc_trace_skip_symbol(const char *name) __attribute__((no_instrument_function)); +static int mfc_trace_is_mpi_symbol(const char *name) __attribute__((no_instrument_function)); +static const char *mfc_trace_prefix_color(const char *prefix) __attribute__((no_instrument_function)); +static int mfc_trace_format_line(char *line, size_t line_size, int color, const char *prefix, const char *name, const char *caller, int depth) + __attribute__((no_instrument_function)); +static void mfc_trace_write_at_depth(const char *name, void *call_site, int depth) __attribute__((no_instrument_function)); +static void mfc_trace_write_line(const char *prefix, const char *name, void *call_site, int depth) __attribute__((no_instrument_function)); +static void mfc_trace_remember_setup_call(void *addr, void *call_site) __attribute__((no_instrument_function)); +static int mfc_trace_remember_context_call(void *addr, void *call_site) __attribute__((no_instrument_function)); +static void mfc_trace_write_context_call(void *addr, void *call_site) __attribute__((no_instrument_function)); +static int mfc_trace_remember_mpi_call(void *addr, void *call_site) __attribute__((no_instrument_function)); +static void mfc_trace_write_mpi_call(void *addr, void *call_site) __attribute__((no_instrument_function)); +static int mfc_trace_stack_index_for_id(uint64_t id) __attribute__((no_instrument_function)); +static void mfc_trace_flush_setup_for_parent(uint64_t parent_id) __attribute__((no_instrument_function)); +static int mfc_trace_common_dumped_prefix(void) __attribute__((no_instrument_function)); +static void mfc_trace_remember_dumped_stack(void) __attribute__((no_instrument_function)); +static void mfc_trace_dump_stack(void) __attribute__((no_instrument_function)); + +static void mfc_trace_initialize_once(void) { + const char *trace_env; + const char *point_env; + const char *setup_env; + const char *context_env; + const char *mpi_env; + const char *format_env; + const char *color_env; + const char *term_env; + const char *file_env; + const char *stdout_env; + + trace_env = getenv("MFC_TRACE"); + trace_enabled = trace_env != NULL && trace_env[0] != '\0' && strcmp(trace_env, "0") != 0; + if (mfc_trace_process_rank() != 0) trace_enabled = 0; + + point_env = getenv("MFC_TRACE_POINT"); + trace_point_enabled = point_env != NULL && point_env[0] != '\0'; + + setup_env = getenv("MFC_TRACE_INCLUDE_SETUP"); + trace_include_setup = setup_env != NULL && setup_env[0] != '\0' && strcmp(setup_env, "0") != 0; + + context_env = getenv("MFC_TRACE_CONTEXT"); + trace_context_enabled = context_env != NULL && context_env[0] != '\0' && strcmp(context_env, "0") != 0; + + mpi_env = getenv("MFC_TRACE_MPI"); + trace_mpi_enabled = mpi_env != NULL && mpi_env[0] != '\0' && strcmp(mpi_env, "0") != 0; + + format_env = getenv("MFC_TRACE_FORMAT"); + trace_tree_format = format_env != NULL && (strcmp(format_env, "tree") == 0 || strcmp(format_env, "structured") == 0); + + color_env = getenv("MFC_TRACE_COLOR"); + term_env = getenv("TERM"); + stdout_env = getenv("MFC_TRACE_STDOUT"); + if (stdout_env != NULL && stdout_env[0] != '\0') { + trace_stdout_enabled = strcmp(stdout_env, "0") != 0 && strcmp(stdout_env, "false") != 0 && strcmp(stdout_env, "never") != 0; + } else { + trace_stdout_enabled = isatty(STDOUT_FILENO); + } + if (color_env != NULL && (strcmp(color_env, "always") == 0 || strcmp(color_env, "1") == 0 || strcmp(color_env, "true") == 0)) { + trace_color_enabled = trace_stdout_enabled; + } else if (color_env != NULL && (strcmp(color_env, "never") == 0 || strcmp(color_env, "0") == 0 || strcmp(color_env, "false") == 0)) { + trace_color_enabled = 0; + } else { + trace_color_enabled = trace_stdout_enabled && getenv("NO_COLOR") == NULL && term_env != NULL && strcmp(term_env, "dumb") != 0; + } + + file_env = getenv("MFC_TRACE_FILE"); + if (trace_enabled && file_env != NULL && file_env[0] != '\0') { + trace_file_fd = open(file_env, O_WRONLY | O_CREAT | O_APPEND, 0644); + if (trace_file_fd < 0) { + static const char message[] = "MFC trace: failed to open MFC_TRACE_FILE\n"; + (void)write(STDERR_FILENO, message, sizeof(message) - 1); + } + } + + if (trace_enabled) { + char executable[MFC_TRACE_NAME_MAX]; + mfc_trace_executable_name(executable, sizeof(executable)); + mfc_trace_write_line("TRACE_RUN", executable, NULL, 0); + } +} + +static void mfc_trace_initialize(void) { + (void)pthread_once(&trace_initialize_once, mfc_trace_initialize_once); +} + +static int mfc_trace_process_rank(void) { + const char *rank_env_names[] = { + "OMPI_COMM_WORLD_RANK", + "PMI_RANK", + "PMIX_RANK", + "MV2_COMM_WORLD_RANK", + "SLURM_PROCID", + NULL, + }; + const char *value; + char *end; + long rank; + int i; + + for (i = 0; rank_env_names[i] != NULL; ++i) { + value = getenv(rank_env_names[i]); + if (value == NULL || value[0] == '\0') continue; + + rank = strtol(value, &end, 10); + if (end != value) return (int)rank; + } + + return 0; +} + +static void mfc_trace_executable_name(char *buffer, size_t buffer_size) { + char path[1024]; + const char *name; + ssize_t len; + + if (buffer_size == 0) return; + + len = readlink("/proc/self/exe", path, sizeof(path) - 1); + if (len > 0) { + path[len] = '\0'; + name = strrchr(path, '/'); + snprintf(buffer, buffer_size, "%s", name != NULL ? name + 1 : path); + return; + } + + snprintf(buffer, buffer_size, "program"); +} + +static void mfc_trace_format_function_name(const char *name, char *buffer, size_t buffer_size) { + const char *mod_marker; + const char *mp_marker; + const char *mod_start; + const char *proc_start; + const char *suffix; + size_t mod_len; + size_t proc_len; + + if (name == NULL || name[0] == '\0') { + snprintf(buffer, buffer_size, "??"); + return; + } + + mod_marker = strstr(name, "_MOD_"); + if (strncmp(name, "__", 2) == 0 && mod_marker != NULL) { + mod_start = name + 2; + proc_start = mod_marker + 5; + mod_len = (size_t)(mod_marker - mod_start); + proc_len = strlen(proc_start); + snprintf(buffer, buffer_size, "%.*s:%.*s", (int)mod_len, mod_start, (int)proc_len, proc_start); + return; + } + + mp_marker = strstr(name, "_mp_"); + if (mp_marker != NULL) { + mod_start = name; + proc_start = mp_marker + 4; + mod_len = (size_t)(mp_marker - mod_start); + proc_len = strlen(proc_start); + if (proc_len > 0 && proc_start[proc_len - 1] == '_') proc_len -= 1; + snprintf(buffer, buffer_size, "%.*s_%.*s", (int)mod_len, mod_start, (int)proc_len, proc_start); + return; + } + + suffix = name + strlen(name); + if (suffix > name && suffix[-1] == '_') { + snprintf(buffer, buffer_size, "%.*s", (int)(suffix - name - 1), name); + return; + } + + snprintf(buffer, buffer_size, "%s", name); +} + +static void mfc_trace_pretty_file(const char *file, char *buffer, size_t buffer_size) { + const char *src; + const char *build; + + if (file == NULL || file[0] == '\0' || strcmp(file, "??:0") == 0 || strcmp(file, "??:?") == 0) { + buffer[0] = '\0'; + return; + } + + src = strstr(file, "/src/"); + if (src != NULL) { + snprintf(buffer, buffer_size, "%s", src + 1); + return; + } + + build = strstr(file, "/build/"); + if (build != NULL) { + snprintf(buffer, buffer_size, "%s", build + 1); + return; + } + + snprintf(buffer, buffer_size, "%s", file); +} + +static void mfc_trace_shell_quote(const char *text, char *buffer, size_t buffer_size) { + size_t used = 0; + size_t i; + + if (buffer_size == 0) return; + + buffer[used++] = '\''; + for (i = 0; text != NULL && text[i] != '\0' && used + 5 < buffer_size; ++i) { + if (text[i] == '\'') { + buffer[used++] = '\''; + buffer[used++] = '\\'; + buffer[used++] = '\''; + buffer[used++] = '\''; + } else { + buffer[used++] = text[i]; + } + } + + if (used + 1 < buffer_size) buffer[used++] = '\''; + buffer[used < buffer_size ? used : buffer_size - 1] = '\0'; +} + +static int mfc_trace_symbol_from_addr2line(void *addr, const char *image, char *buffer, size_t buffer_size) { + char quoted_image[1024]; + char command[1400]; + char raw_name[MFC_TRACE_NAME_MAX]; + char raw_file[MFC_TRACE_NAME_MAX]; + char pretty_name[MFC_TRACE_NAME_MAX]; + char pretty_file[MFC_TRACE_NAME_MAX]; + FILE *pipe; + size_t len; + + if (image == NULL || image[0] == '\0') return 0; + + mfc_trace_shell_quote(image, quoted_image, sizeof(quoted_image)); + snprintf(command, sizeof(command), "addr2line -f -C -e %s %p 2>/dev/null", quoted_image, addr); + + pipe = popen(command, "r"); + if (pipe == NULL) return 0; + + if (fgets(raw_name, sizeof(raw_name), pipe) == NULL || fgets(raw_file, sizeof(raw_file), pipe) == NULL) { + (void)pclose(pipe); + return 0; + } + + (void)pclose(pipe); + + len = strlen(raw_name); + while (len > 0 && (raw_name[len - 1] == '\n' || raw_name[len - 1] == '\r')) raw_name[--len] = '\0'; + + len = strlen(raw_file); + while (len > 0 && (raw_file[len - 1] == '\n' || raw_file[len - 1] == '\r')) raw_file[--len] = '\0'; + + if (raw_name[0] == '\0' || strcmp(raw_name, "??") == 0) return 0; + + mfc_trace_format_function_name(raw_name, pretty_name, sizeof(pretty_name)); + mfc_trace_pretty_file(raw_file, pretty_file, sizeof(pretty_file)); + + if (pretty_file[0] != '\0') { + snprintf(buffer, buffer_size, "%s [%s]", pretty_name, pretty_file); + } else { + snprintf(buffer, buffer_size, "%s", pretty_name); + } + + return 1; +} + +static int mfc_trace_call_site_from_addr2line(void *addr, char *buffer, size_t buffer_size) { + Dl_info info; + char quoted_image[1024]; + char command[1400]; + char raw_name[MFC_TRACE_NAME_MAX]; + char raw_file[MFC_TRACE_NAME_MAX]; + char pretty_name[MFC_TRACE_NAME_MAX]; + char pretty_file[MFC_TRACE_NAME_MAX]; + FILE *pipe; + size_t len; + + if (addr == NULL) return 0; + if (dladdr(addr, &info) == 0 || info.dli_fname == NULL) return 0; + + mfc_trace_shell_quote(info.dli_fname, quoted_image, sizeof(quoted_image)); + snprintf(command, sizeof(command), "addr2line -f -C -e %s %p 2>/dev/null", quoted_image, addr); + + pipe = popen(command, "r"); + if (pipe == NULL) return 0; + + if (fgets(raw_name, sizeof(raw_name), pipe) == NULL || fgets(raw_file, sizeof(raw_file), pipe) == NULL) { + (void)pclose(pipe); + return 0; + } + + (void)pclose(pipe); + + len = strlen(raw_name); + while (len > 0 && (raw_name[len - 1] == '\n' || raw_name[len - 1] == '\r')) raw_name[--len] = '\0'; + + len = strlen(raw_file); + while (len > 0 && (raw_file[len - 1] == '\n' || raw_file[len - 1] == '\r')) raw_file[--len] = '\0'; + + if (raw_file[0] == '\0' || strcmp(raw_file, "??:0") == 0 || strcmp(raw_file, "??:?") == 0) return 0; + + mfc_trace_format_function_name(raw_name, pretty_name, sizeof(pretty_name)); + mfc_trace_pretty_file(raw_file, pretty_file, sizeof(pretty_file)); + + if (pretty_name[0] != '\0' && strcmp(pretty_name, "??") != 0) { + snprintf(buffer, buffer_size, " <- called from %s [%s]", pretty_name, pretty_file); + } else { + snprintf(buffer, buffer_size, " <- called from [%s]", pretty_file); + } + + return 1; +} + +static void mfc_trace_symbol(void *addr, char *buffer, size_t buffer_size) { + Dl_info info; + const char *image = NULL; + char resolved[MFC_TRACE_NAME_MAX]; + char dladdr_name[MFC_TRACE_NAME_MAX]; + int dladdr_ok; + int i; + + for (i = 0; i < trace_symbol_cache_count; ++i) { + if (trace_symbol_cache[i].addr == addr) { + snprintf(buffer, buffer_size, "%s", trace_symbol_cache[i].name); + return; + } + } + + dladdr_ok = dladdr(addr, &info); + dladdr_name[0] = '\0'; + if (dladdr_ok != 0) { + image = info.dli_fname; + if (info.dli_sname != NULL) { + mfc_trace_format_function_name(info.dli_sname, dladdr_name, sizeof(dladdr_name)); + } + } + + if (mfc_trace_symbol_from_addr2line(addr, image, resolved, sizeof(resolved))) { + snprintf(buffer, buffer_size, "%s", resolved); + } else if (dladdr_name[0] != '\0') { + snprintf(buffer, buffer_size, "%s", dladdr_name); + } else { + snprintf(buffer, buffer_size, "%p", addr); + } + + if (trace_symbol_cache_count < MFC_TRACE_SYMBOL_CACHE_MAX) { + trace_symbol_cache[trace_symbol_cache_count].addr = addr; + snprintf(trace_symbol_cache[trace_symbol_cache_count].name, sizeof(trace_symbol_cache[trace_symbol_cache_count].name), "%s", buffer); + trace_symbol_cache_count += 1; + } +} + +static int mfc_trace_skip_symbol(const char *name) { + return strstr(name, "__cyg_profile_func_") != NULL || + strstr(name, "mfc_trace_") != NULL || + strstr(name, "m_trace:") != NULL || + strstr(name, "m_trace_s_") != NULL || + strstr(name, "m_nvtx_") != NULL || + strncmp(name, "..acc_", 6) == 0 || + strstr(name, "acc_cuda_funcreg_constructor") != NULL || + strstr(name, "acc_data_constructor") != NULL || + strcmp(name, "s_trace_point_begin") == 0 || + strcmp(name, "s_trace_point_end") == 0; +} + +static int mfc_trace_is_mpi_symbol(const char *name) { + return strstr(name, "m_mpi") != NULL || + strstr(name, "_mpi_") != NULL || + strstr(name, ":mpi_") != NULL || + strstr(name, "mpi_") == name; +} + +static const char *mfc_trace_prefix_color(const char *prefix) { + if (strcmp(prefix, "TRACE_CONTEXT") == 0) return MFC_TRACE_COLOR_CONTEXT; + if (strcmp(prefix, "TRACE_MPI") == 0) return MFC_TRACE_COLOR_MPI; + return MFC_TRACE_COLOR_TRACE; +} + +static void mfc_trace_write_at_depth(const char *name, void *call_site, int depth) { + mfc_trace_write_line("TRACE", name, call_site, depth); +} + +static int mfc_trace_format_line(char *line, size_t line_size, int color, const char *prefix, const char *name, const char *caller, int depth) { + int len = 0; + int i; + + if (color) { + const char *prefix_color = mfc_trace_prefix_color(prefix); + + len = snprintf(line, line_size, "%s%s%s ", prefix_color, prefix, MFC_TRACE_COLOR_RESET); + if (trace_tree_format) { + for (i = 0; i < depth && len + 2 < (int)line_size; ++i) { + line[len++] = ' '; + line[len++] = ' '; + } + } + if (caller[0] != '\0') { + len += snprintf(line + len, line_size - (size_t)len, "%s%s%s%s%s%s\n", MFC_TRACE_COLOR_BOLD, name, + MFC_TRACE_COLOR_RESET, MFC_TRACE_COLOR_DIM, caller, MFC_TRACE_COLOR_RESET); + } else { + len += snprintf(line + len, line_size - (size_t)len, "%s%s%s\n", MFC_TRACE_COLOR_BOLD, name, MFC_TRACE_COLOR_RESET); + } + } else if (!trace_tree_format) { + len = snprintf(line, line_size, "%s %s%s\n", prefix, name, caller); + } else { + len = snprintf(line, line_size, "%s ", prefix); + for (i = 0; i < depth && len + 2 < (int)line_size; ++i) { + line[len++] = ' '; + line[len++] = ' '; + } + if (len < (int)line_size) { + len += snprintf(line + len, line_size - (size_t)len, "%s%s\n", name, caller); + } + } + + if (len > 0 && (size_t)len > line_size) len = (int)line_size; + return len; +} + +static void mfc_trace_write_line(const char *prefix, const char *name, void *call_site, int depth) { + char line[MFC_TRACE_NAME_MAX * 2 + 256]; + char caller[MFC_TRACE_NAME_MAX]; + int len; + + if (mfc_trace_skip_symbol(name)) return; + caller[0] = '\0'; + (void)mfc_trace_call_site_from_addr2line(call_site, caller, sizeof(caller)); + + len = mfc_trace_format_line(line, sizeof(line), 0, prefix, name, caller, depth); + if (len > 0) { + if (trace_file_fd >= 0) (void)write(trace_file_fd, line, (size_t)len); + if (trace_stdout_enabled) { + len = mfc_trace_format_line(line, sizeof(line), trace_color_enabled, prefix, name, caller, depth); + if (len > 0) (void)write(STDOUT_FILENO, line, (size_t)len); + } + } +} + +static void mfc_trace_remember_setup_call(void *addr, void *call_site) { + int parent_depth; + uint64_t parent_id; + int i; + + if (!trace_include_setup || !trace_point_enabled || trace_point_depth > 0) return; + if (trace_stack_depth < 2 || trace_setup_count >= MFC_TRACE_SETUP_MAX) return; + + parent_depth = trace_stack_depth - 2; + if (parent_depth < 2) return; + parent_id = trace_stack_ids[parent_depth]; + + for (i = 0; i < trace_setup_count; ++i) { + if (trace_setup[i].parent_id == parent_id && trace_setup[i].addr == addr) return; + } + + trace_setup[trace_setup_count].addr = addr; + trace_setup[trace_setup_count].call_site = call_site; + trace_setup[trace_setup_count].self_id = trace_stack_ids[trace_stack_depth - 1]; + trace_setup[trace_setup_count].parent_id = parent_id; + trace_setup[trace_setup_count].depth = trace_stack_depth - 1; + trace_setup[trace_setup_count].emitted = 0; + trace_setup_count += 1; +} + +static int mfc_trace_remember_context_call(void *addr, void *call_site) { + int i; + + if (!trace_context_enabled || !trace_point_enabled || trace_point_depth > 0) return 0; + if (trace_context_count >= MFC_TRACE_CONTEXT_MAX) return 0; + + for (i = 0; i < trace_context_count; ++i) { + if (trace_context[i].addr == addr && trace_context[i].call_site == call_site) return 0; + } + + trace_context[trace_context_count].addr = addr; + trace_context[trace_context_count].call_site = call_site; + trace_context_count += 1; + return 1; +} + +static void mfc_trace_write_context_call(void *addr, void *call_site) { + char name[MFC_TRACE_NAME_MAX]; + + if (!mfc_trace_remember_context_call(addr, call_site)) return; + + mfc_trace_symbol(addr, name, sizeof(name)); + mfc_trace_write_line("TRACE_CONTEXT", name, call_site, trace_stack_depth > 0 ? trace_stack_depth - 1 : 0); +} + +static int mfc_trace_remember_mpi_call(void *addr, void *call_site) { + int i; + + if (!trace_mpi_enabled || trace_mpi_context_count >= MFC_TRACE_CONTEXT_MAX) return 0; + + for (i = 0; i < trace_mpi_context_count; ++i) { + if (trace_mpi_context[i].addr == addr && trace_mpi_context[i].call_site == call_site) return 0; + } + + trace_mpi_context[trace_mpi_context_count].addr = addr; + trace_mpi_context[trace_mpi_context_count].call_site = call_site; + trace_mpi_context_count += 1; + return 1; +} + +static void mfc_trace_write_mpi_call(void *addr, void *call_site) { + char name[MFC_TRACE_NAME_MAX]; + + mfc_trace_symbol(addr, name, sizeof(name)); + if (!mfc_trace_is_mpi_symbol(name)) return; + if (!mfc_trace_remember_mpi_call(addr, call_site)) return; + + mfc_trace_write_line("TRACE_MPI", name, call_site, trace_stack_depth > 0 ? trace_stack_depth - 1 : 0); +} + +static int mfc_trace_stack_index_for_id(uint64_t id) { + int i; + + for (i = 0; i < trace_stack_depth; ++i) { + if (trace_stack_ids[i] == id) return i; + } + + return -1; +} + +static void mfc_trace_flush_setup_for_parent(uint64_t parent_id) { + char name[MFC_TRACE_NAME_MAX]; + int i; + + if (!trace_include_setup) return; + + for (i = 0; i < trace_setup_count; ++i) { + if (trace_setup[i].emitted) continue; + if (trace_setup[i].parent_id != parent_id) continue; + if (mfc_trace_stack_index_for_id(trace_setup[i].self_id) >= 0) continue; + + mfc_trace_symbol(trace_setup[i].addr, name, sizeof(name)); + mfc_trace_write_at_depth(name, trace_setup[i].call_site, trace_setup[i].depth); + trace_setup[i].emitted = 1; + } +} + +static int mfc_trace_common_dumped_prefix(void) { + int i; + int prefix = 0; + int depth = trace_last_dump_depth; + + if (depth < 0) return 0; + if (depth > trace_stack_depth) depth = trace_stack_depth; + + for (i = 0; i < depth; ++i) { + if (trace_last_dump_ids[i] != trace_stack_ids[i]) break; + prefix += 1; + } + + return prefix; +} + +static void mfc_trace_remember_dumped_stack(void) { + int i; + + trace_last_dump_depth = trace_stack_depth; + for (i = 0; i < trace_stack_depth; ++i) { + trace_last_dump_ids[i] = trace_stack_ids[i]; + } +} + +static void mfc_trace_dump_stack(void) { + char name[MFC_TRACE_NAME_MAX]; + int i; + int start; + + start = mfc_trace_common_dumped_prefix(); + + for (i = start; i < trace_stack_depth; ++i) { + mfc_trace_symbol(trace_stack[i], name, sizeof(name)); + mfc_trace_write_at_depth(name, trace_stack_call_sites[i], i); + mfc_trace_flush_setup_for_parent(trace_stack_ids[i]); + } + + mfc_trace_remember_dumped_stack(); +} + +void mfc_trace_point_begin(void) { + mfc_trace_initialize(); + if (!trace_enabled || !trace_point_enabled) return; + + if (trace_point_depth == 0) { + trace_in_hook = 1; + mfc_trace_dump_stack(); + trace_in_hook = 0; + } + + trace_point_depth += 1; +} + +void mfc_trace_point_end(void) { + mfc_trace_initialize(); + if (!trace_enabled || !trace_point_enabled) return; + if (trace_point_depth > 0) trace_point_depth -= 1; +} + +void __cyg_profile_func_enter(void *this_fn, void *call_site) { + char name[MFC_TRACE_NAME_MAX]; + (void)call_site; + + if (trace_in_hook) return; + trace_in_hook = 1; + + if (trace_stack_depth < MFC_TRACE_STACK_MAX) { + trace_stack_ids[trace_stack_depth] = trace_next_stack_id++; + trace_stack_call_sites[trace_stack_depth] = call_site; + trace_stack[trace_stack_depth++] = this_fn; + } + + mfc_trace_initialize(); + mfc_trace_remember_setup_call(this_fn, call_site); + if (trace_enabled) { + mfc_trace_write_context_call(this_fn, call_site); + mfc_trace_write_mpi_call(this_fn, call_site); + } + + if (trace_enabled && (!trace_point_enabled || trace_point_depth > 0)) { + mfc_trace_symbol(this_fn, name, sizeof(name)); + mfc_trace_write_at_depth(name, call_site, trace_stack_depth > 0 ? trace_stack_depth - 1 : 0); + } + + trace_in_hook = 0; +} + +void __cyg_profile_func_exit(void *this_fn, void *call_site) { + (void)this_fn; + (void)call_site; + + if (trace_in_hook) return; + if (trace_stack_depth > 0) trace_stack_depth -= 1; +} diff --git a/toolchain/instrument_fortran_trace.py b/toolchain/instrument_fortran_trace.py new file mode 100644 index 0000000000..24e9114184 --- /dev/null +++ b/toolchain/instrument_fortran_trace.py @@ -0,0 +1,363 @@ +#!/usr/bin/env python3 +"""Inject runtime grid-point trace scopes into generated Fortran sources.""" + +from __future__ import annotations + +import argparse +import os +import re +from pathlib import Path + +MODULE_RE = re.compile(r"^(\s*)module\s+([a-z_]\w*)\b", re.IGNORECASE) +PROGRAM_RE = re.compile(r"^(\s*)program\s+([a-z_]\w*)\b", re.IGNORECASE) +ROUTINE_RE = re.compile( + r"^(\s*)(?:(?:(?:impure|recursive|module)\s+)*subroutine\s+([a-z_]\w*)|" + r"(?:(?:impure|recursive|module)\s+)*function\s+([a-z_]\w*)|" + r"(?:(?:pure|elemental|recursive|module|impure)\s+)*" + r"(?:integer|real|logical|complex|character|type\s*\([^)]*\)|class\s*\([^)]*\)|double\s+precision)\b[^!]*?\bfunction\s+([a-z_]\w*))\b", + re.IGNORECASE, +) +END_ROUTINE_RE = re.compile(r"^\s*end\s+(?:subroutine|function|program)\b", re.IGNORECASE) +INTERFACE_RE = re.compile(r"^\s*(?:abstract\s+)?interface\b", re.IGNORECASE) +END_INTERFACE_RE = re.compile(r"^\s*end\s+interface\b", re.IGNORECASE) +END_DO_RE = re.compile(r"(?:^|;)\s*end\s+do\b", re.IGNORECASE) +DO_CAPTURE_RE = re.compile(r"(?:^|;)\s*do\s+([a-z_]\w*)\s*=\s*([^;!]+)", re.IGNORECASE) + +POINT_LOOP_CANDIDATES = ( + ("j", "k", "l"), + ("x", "y", "z"), + ("i", "j", "k"), + ("id1", "id2", "id3"), + ("j_loop", "k_loop", "l_loop"), + ("k_loop", "l_loop", "q_loop"), + ("k_idx", "l_idx", "q_idx"), + ("q_idx", "k_idx", "l_idx"), + ("l_idx", "q_idx", "k_idx"), +) + +DECLARATION_STARTERS = ( + "use ", + "import", + "implicit ", + "integer", + "real", + "double precision", + "complex", + "logical", + "character", + "type", + "class", + "procedure", + "external", + "intrinsic", + "parameter", + "dimension", + "allocatable", + "pointer", + "target", + "optional", + "save", + "common", + "equivalence", + "namelist", + "data ", + "private", + "public", + "volatile", + "asynchronous", + "protected", + "enumerator", + "gpu_routine", +) + + +def strip_comment(line: str) -> str: + in_single = False + in_double = False + for idx, char in enumerate(line): + if char == "'" and not in_double: + in_single = not in_single + elif char == '"' and not in_single: + in_double = not in_double + elif char == "!" and not in_single and not in_double: + return line[:idx] + return line + + +def normalized(line: str) -> str: + return strip_comment(line).strip().lower() + + +def is_preprocessor_or_empty(line: str) -> bool: + text = line.strip() + return not text or text.startswith("#") or text.startswith("!") + + +def is_continuation(line: str) -> bool: + return strip_comment(line).rstrip().endswith("&") + + +def is_specification_line(line: str, spec_block_depth: int) -> bool: + text = normalized(line) + if not text: + return True + if text.startswith("#"): + return True + if text.startswith("&"): + return True + if spec_block_depth > 0: + return True + if "::" in text: + return True + return text.startswith(DECLARATION_STARTERS) + + +def starts_spec_block(line: str) -> bool: + text = normalized(line) + return bool(re.match(r"^(type|enum|interface)\b", text)) and not text.startswith("type(") + + +def ends_spec_block(line: str) -> bool: + text = normalized(line) + return bool(re.match(r"^end\s+(type|enum|interface)\b", text)) + + +def should_skip_routine(line: str) -> bool: + text = normalized(line) + return " pure " in f" {text} " or " elemental " in f" {text} " + + +def routine_name(match: re.Match[str]) -> str: + return next(group for group in match.groups()[1:] if group is not None) + + +def is_x_bound(bounds: str) -> bool: + return bool( + re.search( + r"(^|[^a-z0-9_])(m|m_glb|local_m|x_beg|x_end|offset_x|is1[a-z0-9_]*|isx|ix|idwint\(1\)|idwbuff\(1\)|ibounds\(1\)|bounds\(1\)|isc1|id_norm\(1\))([^a-z0-9_]|$)", + bounds, + ) + ) + + +def is_y_bound(bounds: str) -> bool: + return bool( + re.search( + r"(^|[^a-z0-9_])(n|n_glb|local_n|y_beg|y_end|offset_y|is2[a-z0-9_]*|isy|iy|idwint\(2\)|idwbuff\(2\)|ibounds\(2\)|bounds\(2\)|isc2|id_norm\(2\))([^a-z0-9_]|$)", + bounds, + ) + ) + + +def is_z_bound(bounds: str) -> bool: + return bool( + re.search( + r"(^|[^a-z0-9_])(p|p_glb|local_p|z_beg|z_end|offset_z|is3[a-z0-9_]*|isz|iz|idwint\(3\)|idwbuff\(3\)|ibounds\(3\)|bounds\(3\)|isc3|id_norm\(3\))([^a-z0-9_]|$)", + bounds, + ) + ) + + +def split_do_bounds(bounds: str) -> tuple[str, str]: + parts: list[str] = [] + start = 0 + depth = 0 + in_single = False + in_double = False + + for idx, char in enumerate(bounds): + if char == "'" and not in_double: + in_single = not in_single + elif char == '"' and not in_single: + in_double = not in_double + elif not in_single and not in_double: + if char == "(": + depth += 1 + elif char == ")" and depth > 0: + depth -= 1 + elif char == "," and depth == 0: + parts.append(bounds[start:idx].strip()) + start = idx + 1 + + parts.append(bounds[start:].strip()) + if len(parts) >= 2 and parts[0] and parts[1]: + return parts[0], parts[1] + + return "0", "0" + + +def active_point_vars(loop_stack: list[dict[str, object]]) -> tuple[str, str, str] | None: + active = {str(entry["var"]): str(entry["bounds"]) for entry in loop_stack} + for candidate in POINT_LOOP_CANDIDATES: + if all(var in active for var in candidate) and is_x_bound(active[candidate[0]]) and is_y_bound(active[candidate[1]]) and is_z_bound(active[candidate[2]]): + return candidate + return None + + +def point_scope_active(loop_stack: list[dict[str, object]]) -> bool: + return any(bool(entry["point_scope"]) for entry in loop_stack) + + +def point_begin_line(indent: str, point_vars: tuple[str, str, str], loop_stack: list[dict[str, object]]) -> str: + active = {str(entry["var"]): entry for entry in loop_stack} + point_args = ", ".join(f"int({var})" for var in point_vars) + midpoint_args = [] + for var in point_vars: + entry = active[var] + midpoint_args.append(f"int(({entry['lower']}) + ({entry['upper']}))/2") + return f"{indent}call s_trace_point_begin({point_args}, {', '.join(midpoint_args)})\n" + + +def instrument(lines: list[str], point_traces: bool = True) -> list[str]: + if any(re.match(r"^\s*module\s+m_trace\b", line, re.IGNORECASE) for line in lines): + return lines + + output: list[str] = [] + scope_stack: list[str] = [] + pending_routine: dict[str, str | int] | None = None + routine_stack: list[str] = [] + loop_stack: list[dict[str, object]] = [] + interface_depth = 0 + spec_block_depth = 0 + + idx = 0 + while idx < len(lines): + line = lines[idx] + text = normalized(line) + + if END_INTERFACE_RE.match(line): + interface_depth = max(0, interface_depth - 1) + elif INTERFACE_RE.match(line): + interface_depth += 1 + + if pending_routine is not None: + if is_preprocessor_or_empty(line): + output.append(line) + idx += 1 + continue + + if ends_spec_block(line): + spec_block_depth = max(0, spec_block_depth - 1) + output.append(line) + idx += 1 + continue + + if pending_routine.get("continuation") or is_specification_line(line, spec_block_depth): + output.append(line) + pending_routine["continuation"] = is_continuation(line) + if starts_spec_block(line): + spec_block_depth += 1 + idx += 1 + continue + + routine_stack.append(str(pending_routine["name"])) + pending_routine = None + spec_block_depth = 0 + continue + + module_match = MODULE_RE.match(line) + if module_match and not text.startswith("module procedure") and not text.startswith("end module"): + module_name = module_match.group(2) + output.append(line) + if module_name.lower() != "m_trace": + output.append(f"{module_match.group(1)} use m_trace, only: s_trace_point_begin, s_trace_point_end\n") + scope_stack.append(module_name) + idx += 1 + continue + + program_match = PROGRAM_RE.match(line) + if program_match and not text.startswith("end program"): + program_name = program_match.group(2) + output.append(line) + output.append(f"{program_match.group(1)} use m_trace, only: s_trace_point_begin, s_trace_point_end\n") + pending_routine = { + "indent": f"{program_match.group(1)} ", + "name": program_name, + "continuation": is_continuation(line), + } + idx += 1 + continue + + if text.startswith("end module") and scope_stack: + scope_stack.pop() + + routine_match = ROUTINE_RE.match(line) + if routine_match and interface_depth == 0 and not text.startswith("end ") and " procedure " not in f" {text} " and not should_skip_routine(line): + output.append(line) + if not scope_stack: + output.append(f"{routine_match.group(1)} use m_trace, only: s_trace_point_begin, s_trace_point_end\n") + pending_routine = { + "indent": f"{routine_match.group(1)} ", + "name": routine_name(routine_match), + "continuation": is_continuation(line), + } + idx += 1 + continue + + if routine_stack and END_ROUTINE_RE.match(line): + routine_stack.pop() + loop_stack = [] + output.append(line) + idx += 1 + continue + + if routine_stack and point_traces: + code_line = strip_comment(line) + end_do_count = len(END_DO_RE.findall(code_line)) + if end_do_count: + end_indent = re.match(r"^(\s*)", line).group(1) + " " + for _ in range(min(end_do_count, len(loop_stack))): + loop_entry = loop_stack.pop() + if loop_entry["point_scope"]: + output.append(f"{end_indent}call s_trace_point_end()\n") + else: + begin_after_line: list[str] = [] + for do_match in DO_CAPTURE_RE.finditer(code_line): + bounds = " ".join(do_match.group(2).lower().split()) + lower, upper = split_do_bounds(bounds) + loop_stack.append( + { + "var": do_match.group(1).lower(), + "bounds": bounds, + "lower": lower, + "upper": upper, + "point_scope": False, + } + ) + point_vars = active_point_vars(loop_stack) + if point_vars is not None and not point_scope_active(loop_stack): + loop_stack[-1]["point_scope"] = True + begin_indent = re.match(r"^(\s*)", line).group(1) + " " + begin_after_line.append(point_begin_line(begin_indent, point_vars, loop_stack)) + + if begin_after_line: + output.append(line) + output.extend(begin_after_line) + idx += 1 + continue + + output.append(line) + idx += 1 + + return output + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument("--no-point-traces", action="store_true", help="Skip grid-point trace probes in accelerated GPU builds.") + parser.add_argument("input", type=Path) + parser.add_argument("output", type=Path) + args = parser.parse_args() + + lines = args.input.read_text().splitlines(keepends=True) + traced = instrument(lines, point_traces=not args.no_point_traces) + + args.output.parent.mkdir(parents=True, exist_ok=True) + tmp = args.output.with_suffix(args.output.suffix + ".tmp") + tmp.write_text("".join(traced)) + os.replace(tmp, args.output) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/toolchain/mfc/build.py b/toolchain/mfc/build.py index 722d27a0d7..6f386b1ba2 100644 --- a/toolchain/mfc/build.py +++ b/toolchain/mfc/build.py @@ -420,6 +420,7 @@ def configure(self, case: Case): flags.append(f"-DMFC_GCov={'ON' if ARG('gcov') else 'OFF'}") flags.append(f"-DMFC_Unified={'ON' if ARG('unified') else 'OFF'}") flags.append(f"-DMFC_Fastmath={'ON' if ARG('fastmath') else 'OFF'}") + flags.append(f"-DMFC_TRACE_CALLS={'ON' if ARG('trace') else 'OFF'}") command = ["cmake"] + flags + ["-S", cmake_dirpath, "-B", build_dirpath] diff --git a/toolchain/mfc/cli/argparse_gen.py b/toolchain/mfc/cli/argparse_gen.py index e869ae5bbb..34092c83fb 100644 --- a/toolchain/mfc/cli/argparse_gen.py +++ b/toolchain/mfc/cli/argparse_gen.py @@ -70,7 +70,7 @@ def _add_positional(parser: argparse.ArgumentParser, pos: Positional): parser.add_argument(pos.name, **kwargs) -def _add_mfc_config_arguments(parser: argparse.ArgumentParser, config): +def _add_mfc_config_arguments(parser: argparse.ArgumentParser, config, exclude=None): """ Add MFCConfig boolean pair arguments dynamically. @@ -79,7 +79,10 @@ def _add_mfc_config_arguments(parser: argparse.ArgumentParser, config): # Import here to avoid circular dependency from ..state import gpuConfigOptions + exclude = set(exclude or []) for f in dataclasses.fields(config): + if f.name in exclude: + continue if f.name == "gpu": parser.add_argument( f"--{f.name}", @@ -109,7 +112,7 @@ def _add_common_arguments(parser: argparse.ArgumentParser, command: Command, com # Handle MFC config flags specially if common_set.mfc_config_flags and config is not None: - _add_mfc_config_arguments(parser, config) + _add_mfc_config_arguments(parser, config, command.exclude_config_flags) else: for arg in common_set.arguments: _add_argument(parser, arg) diff --git a/toolchain/mfc/cli/commands.py b/toolchain/mfc/cli/commands.py index 1e1898b117..79eb1f8f3f 100644 --- a/toolchain/mfc/cli/commands.py +++ b/toolchain/mfc/cli/commands.py @@ -146,12 +146,14 @@ Example("./mfc.sh build", "Build all default targets (CPU)"), Example("./mfc.sh build -j 8", "Build with 8 parallel jobs"), Example("./mfc.sh build --gpu", "Build with GPU (OpenACC) support"), + Example("./mfc.sh build --trace", "Build with generated runtime call tracing"), Example("./mfc.sh build -i case.py --case-optimization -j 8", "Case optimization (10x faster!)"), ], key_options=[ ("-j, --jobs N", "Number of parallel build jobs"), ("-t, --targets", "Targets: pre_process, simulation, post_process"), ("--gpu [acc|mp]", "Enable GPU support (OpenACC or OpenMP)"), + ("--trace", "Enable generated runtime call tracing"), ("--case-optimization", "Hard-code case params for 10x speedup"), ("--debug", "Build in debug mode"), ], @@ -163,6 +165,7 @@ help="Run a case with MFC.", description="Run an MFC simulation case interactively or submit as a batch job.", include_common=["targets", "mfc_config", "jobs", "verbose", "debug_log", "gpus"], + exclude_config_flags=["trace"], positionals=[ Positional( name="input", @@ -274,6 +277,13 @@ default=False, dest="no_build", ), + Argument( + name="trace", + help="Emit a runtime call trace. CPU builds include the local middle grid point; GPU builds trace host calls only.", + action=ArgAction.STORE_TRUE, + default=False, + dest="runtime_trace", + ), Argument( name="wait", help="(Batch) Wait for the job to finish.", @@ -350,6 +360,7 @@ Example("./mfc.sh run case.py -e batch -N 2 -n 4", "Submit batch job: 2 nodes, 4 ranks/node"), Example("./mfc.sh run case.py --archive /mnt/nas/mfc-runs", "Archive run into /mnt/nas/mfc-runs/-/"), Example("./mfc.sh run case.py --archive /mnt/nas/mfc-runs --archive-format tar.zst", "Archive as a compressed tarball"), + Example("./mfc.sh run case.py --trace", "Trace runtime calls; CPU builds include the local middle grid point"), ], key_options=[ ("--case-optimization", "Hard-code params for 10x speedup!"), @@ -360,6 +371,7 @@ ("-w, --walltime", "Wall time limit (batch)"), ("--archive PATH", "Archive inputs+outputs after interactive run"), ("--archive-format FMT", "Archive format: dir, tar, tar.zst"), + ("--trace", "Emit a runtime call trace"), ], ) diff --git a/toolchain/mfc/cli/completion_gen.py b/toolchain/mfc/cli/completion_gen.py index de1ac0cd98..7f18a8a08c 100644 --- a/toolchain/mfc/cli/completion_gen.py +++ b/toolchain/mfc/cli/completion_gen.py @@ -22,6 +22,7 @@ def _collect_all_options(cmd: Command, schema: CLISchema) -> List[str]: """Collect all option flags for a command including common sets.""" options: Set[str] = set() + excluded_config_flags = set(cmd.exclude_config_flags) # Common arguments for set_name in cmd.include_common: @@ -31,26 +32,20 @@ def _collect_all_options(cmd: Command, schema: CLISchema) -> List[str]: # MFC config flags if common_set.mfc_config_flags: - options.update( - [ - "--mpi", - "--no-mpi", - "--gpu", - "--no-gpu", - "--debug", - "--no-debug", - "--gcov", - "--no-gcov", - "--unified", - "--no-unified", - "--single", - "--no-single", - "--mixed", - "--no-mixed", - "--fastmath", - "--no-fastmath", - ] - ) + config_options = { + "mpi": ["--mpi", "--no-mpi"], + "gpu": ["--gpu", "--no-gpu"], + "debug": ["--debug", "--no-debug"], + "gcov": ["--gcov", "--no-gcov"], + "unified": ["--unified", "--no-unified"], + "single": ["--single", "--no-single"], + "mixed": ["--mixed", "--no-mixed"], + "fastmath": ["--fastmath", "--no-fastmath"], + "trace": ["--trace", "--no-trace"], + } + for name, flags in config_options.items(): + if name not in excluded_config_flags: + options.update(flags) else: for arg in common_set.arguments: if arg.short: @@ -318,32 +313,27 @@ def _generate_zsh_command_args(cmd: Command, schema: CLISchema) -> List[str]: arg_lines.append(_zsh_completion_for_positional(pos, i + 1)) # Options from common sets + excluded_config_flags = set(cmd.exclude_config_flags) for set_name in cmd.include_common: common_set = schema.get_common_set(set_name) if common_set is None: continue if common_set.mfc_config_flags: - arg_lines.extend( - [ - "'--mpi[Enable MPI]'", - "'--no-mpi[Disable MPI]'", - "'--gpu[Enable GPU]:mode:(acc mp)'", - "'--no-gpu[Disable GPU]'", - "'--debug[Build with debug compiler flags (for MFC code)]'", - "'--no-debug[Build without debug flags]'", - "'--gcov[Enable gcov coverage]'", - "'--no-gcov[Disable gcov coverage]'", - "'--unified[Enable unified memory]'", - "'--no-unified[Disable unified memory]'", - "'--single[Enable single precision]'", - "'--no-single[Disable single precision]'", - "'--mixed[Enable mixed precision]'", - "'--no-mixed[Disable mixed precision]'", - "'--fastmath[Enable fast math]'", - "'--no-fastmath[Disable fast math]'", - ] - ) + config_args = { + "mpi": ["'--mpi[Enable MPI]'", "'--no-mpi[Disable MPI]'"], + "gpu": ["'--gpu[Enable GPU]:mode:(acc mp)'", "'--no-gpu[Disable GPU]'"], + "debug": ["'--debug[Build with debug compiler flags (for MFC code)]'", "'--no-debug[Build without debug flags]'"], + "gcov": ["'--gcov[Enable gcov coverage]'", "'--no-gcov[Disable gcov coverage]'"], + "unified": ["'--unified[Enable unified memory]'", "'--no-unified[Disable unified memory]'"], + "single": ["'--single[Enable single precision]'", "'--no-single[Disable single precision]'"], + "mixed": ["'--mixed[Enable mixed precision]'", "'--no-mixed[Disable mixed precision]'"], + "fastmath": ["'--fastmath[Enable fast math]'", "'--no-fastmath[Disable fast math]'"], + "trace": ["'--trace[Enable generated runtime call tracing]'", "'--no-trace[Disable generated runtime call tracing]'"], + } + for name, args in config_args.items(): + if name not in excluded_config_flags: + arg_lines.extend(args) else: for arg in common_set.arguments: desc = arg.help.replace("'", "").replace("[", "").replace("]", "")[:120] diff --git a/toolchain/mfc/cli/docs_gen.py b/toolchain/mfc/cli/docs_gen.py index 8d7347a0aa..a30a1e9186 100644 --- a/toolchain/mfc/cli/docs_gen.py +++ b/toolchain/mfc/cli/docs_gen.py @@ -92,10 +92,16 @@ def _generate_options_table(cmd: Command, schema: CLISchema) -> List[str]: # Add MFC config flags if included if "mfc_config" in cmd.include_common: - lines.append("| `--mpi`, `--no-mpi` | Enable/disable MPI | `true` |") - lines.append("| `--gpu [acc/mp]`, `--no-gpu` | Enable GPU (OpenACC/OpenMP) | `no` |") - lines.append("| `--debug`, `--no-debug` | Build with full debug compiler flags | `false` |") - lines.append("| `--reldebug`, `--no-reldebug` | Build with lightweight debug flags (CI) | `false` |") + config_rows = { + "mpi": "| `--mpi`, `--no-mpi` | Enable/disable MPI | `true` |", + "gpu": "| `--gpu [acc/mp]`, `--no-gpu` | Enable GPU (OpenACC/OpenMP) | `no` |", + "debug": "| `--debug`, `--no-debug` | Build with full debug compiler flags | `false` |", + "reldebug": "| `--reldebug`, `--no-reldebug` | Build with lightweight debug flags (CI) | `false` |", + "trace": "| `--trace`, `--no-trace` | Enable generated runtime call tracing; CPU builds include the local middle grid point | `false` |", + } + for name, row in config_rows.items(): + if name not in cmd.exclude_config_flags: + lines.append(row) lines.append("") @@ -282,6 +288,7 @@ def generate_cli_reference(schema: CLISchema) -> str: "| `--debug` / `--no-debug` | Build with full debug compiler flags |", "| `--reldebug` / `--no-reldebug` | Build with lightweight debug flags (CI) |", "| `--gcov` / `--no-gcov` | Enable code coverage |", + "| `--trace` / `--no-trace` | Generated runtime call tracing; CPU builds include the local middle grid point |", "| `--single` / `--no-single` | Single precision |", "| `--mixed` / `--no-mixed` | Mixed precision |", "", diff --git a/toolchain/mfc/cli/schema.py b/toolchain/mfc/cli/schema.py index 871d98f6da..a195d632ae 100644 --- a/toolchain/mfc/cli/schema.py +++ b/toolchain/mfc/cli/schema.py @@ -135,6 +135,7 @@ class Command: # Inherit common argument sets include_common: List[str] = field(default_factory=list) # e.g., ["targets", "jobs"] + exclude_config_flags: List[str] = field(default_factory=list) # Subcommands (for nested commands like "packer pack") subcommands: List["Command"] = field(default_factory=list) diff --git a/toolchain/mfc/run/run.py b/toolchain/mfc/run/run.py index 82e886c064..3789ef6816 100644 --- a/toolchain/mfc/run/run.py +++ b/toolchain/mfc/run/run.py @@ -61,6 +61,15 @@ def __profiler_prepend() -> typing.List[str]: return [] +def __runner_prepend() -> typing.List[str]: + profiler = __profiler_prepend() + + if ARG("runtime_trace", False) and ARG("gpu") != gpuConfigOptions.NONE.value: + return ["python3", os.path.join(MFC_ROOT_DIR, "toolchain", "mfc", "run", "trace.py")] + profiler + + return profiler + + def get_baked_templates() -> dict: return {os.path.splitext(os.path.basename(f))[0]: file_read(f) for f in glob(os.path.join(MFC_TEMPLATE_DIR, "*.mako"))} @@ -103,6 +112,26 @@ def __generate_job_script(targets, case: input.MFCInputFile): gpu_acc = gpu_mode == gpuConfigOptions.ACC.value gpu_mp = gpu_mode == gpuConfigOptions.MP.value + if ARG("runtime_trace", False): + trace_file = os.path.abspath(os.path.join(os.path.dirname(ARG("input")), "trace.log")) + with open(trace_file, "w", encoding="utf-8"): + pass + env.update( + { + "MFC_TRACE": "1", + "MFC_TRACE_POINT": "middle", + "MFC_TRACE_FORMAT": "tree", + "MFC_TRACE_INCLUDE_SETUP": "1", + "MFC_TRACE_CONTEXT": "1", + "MFC_TRACE_FILE": trace_file, + "MFC_TRACE_STDOUT": "1" if sys.stdout.isatty() else "0", + } + ) + if gpu_acc or gpu_mp: + env.update({"MFC_TRACE_GPU_NOTIFY": "1", "NVCOMPILER_ACC_NOTIFY": "1", "NV_ACC_NOTIFY": "1"}) + if ARG("mpi"): + env.update({"MFC_TRACE_MPI": "1"}) + content = __get_template().render( **{**ARGS(), "targets": targets}, ARG=ARG, @@ -111,7 +140,7 @@ def __generate_job_script(targets, case: input.MFCInputFile): MFC_ROOT_DIR=MFC_ROOT_DIR, SIMULATION=SIMULATION, qsystem=queues.get_system(), - profiler=shlex.join(__profiler_prepend()), + profiler=shlex.join(__runner_prepend()), gpu_enabled=gpu_enabled, gpu_acc=gpu_acc, gpu_mp=gpu_mp, diff --git a/toolchain/mfc/run/trace.py b/toolchain/mfc/run/trace.py new file mode 100644 index 0000000000..457be4edd5 --- /dev/null +++ b/toolchain/mfc/run/trace.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +import collections +import os +import re +import subprocess +import sys + + +def _normalize_acc_line(line: str) -> str: + line = re.sub(r"file=[^ ]*/(src/[^ ]*)", r"file=\1", line) + line = re.sub(r"[^ ]*/(src/[^ ]*)", r"\1", line) + line = re.sub(r"devaddr=0x[0-9A-Fa-f]+", "devaddr=", line) + return line + + +def _is_acc_noise(line: str) -> bool: + prefixes = ( + "Enter ", + "Leave ", + "Wait ", + "Implicit wait", + "upload CUDA data", + "download CUDA data", + "create CUDA data", + "delete CUDA data", + "alloc", + ) + return line.startswith(prefixes) or (".fpp function=" in line and " device=" in line) + + +def main() -> int: + if len(sys.argv) < 2: + print("usage: trace.py [args...]", file=sys.stderr) + return 2 + + command = sys.argv[1:] + + if any(arg.endswith("/syscheck") for arg in command): + env = os.environ.copy() + env["MFC_TRACE"] = "0" + env.pop("MFC_TRACE_GPU_NOTIFY", None) + env.pop("MFC_TRACE_ACC_NOTIFY", None) + env["NVCOMPILER_ACC_NOTIFY"] = "0" + env["NV_ACC_NOTIFY"] = "0" + return subprocess.run(command, env=env, check=False).returncode + + trace_file = os.environ.get("MFC_TRACE_FILE") + if not (os.environ.get("MFC_TRACE_GPU_NOTIFY") or os.environ.get("MFC_TRACE_ACC_NOTIFY")) or not trace_file: + return subprocess.run(command, check=False).returncode + + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + assert process.stdout is not None + + seen = set() + repeats = collections.Counter() + + with open(trace_file, "a", encoding="utf-8") as trace: + for output_line in process.stdout: + line = output_line.rstrip("\n") + if line.startswith("launch CUDA kernel"): + short_line = _normalize_acc_line(line) + if short_line in seen: + repeats[short_line] += 1 + else: + seen.add(short_line) + trace.write(f"TRACE_ACC {short_line}\n") + trace.flush() + elif not _is_acc_noise(line): + print(line, flush=True) + + for line, count in sorted(repeats.items()): + trace.write(f"TRACE_ACC_REPEAT x{count} {line}\n") + + return process.wait() + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/toolchain/mfc/state.py b/toolchain/mfc/state.py index 94a37be947..d9595511cd 100644 --- a/toolchain/mfc/state.py +++ b/toolchain/mfc/state.py @@ -21,6 +21,7 @@ class MFCConfig: single: bool = False mixed: bool = False fastmath: bool = False + trace: bool = False def __hash__(self): return hash(tuple(getattr(self, f.name) for f in dataclasses.fields(self))) @@ -32,7 +33,7 @@ def from_dict(d: dict): r = MFCConfig() for field in dataclasses.fields(MFCConfig): - setattr(r, field.name, d[field.name]) + setattr(r, field.name, d.get(field.name, getattr(r, field.name))) return r