mfournial · mfournial · Apr 22, 2020 · Apr 22, 2020 · Apr 22, 2020 · Apr 22, 2020
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -69,10 +69,9 @@ add_definitions(-DFAASM_PROFILE_ON=0)
 # Custom LLVM build (also for profiling)
 set(FAASM_CUSTOM_LLVM 0)
 if (${FAASM_CUSTOM_LLVM})
-    message(STATUS "Using custom LLVM")
-
-    # Force using custom build of LLVM for codegen
-    set(LLVM_DIR $ENV{HOME}/faasm/llvm/build/lib/cmake/llvm)
+    # In accordance with bin/build_llvm_perf.sh and LLVM version for WAVM
+    set(LLVM_DIR /usr/local/code/llvm-perf/build/lib/cmake/llvm)
+    message(STATUS "Using custom LLVM at ${LLVM_DIR} for profiling")
     find_package(LLVM REQUIRED CONFIG NO_DEFAULT_PATH)
 
     set(WAVM_PERF_LIBS 1)

diff --git a/ansible/musl_native.yml b/ansible/musl_native.yml
@@ -0,0 +1,6 @@
+---
+
+- hosts: localhost
+  gather_facts: yes
+  roles:
+    - musl_native
diff --git a/ansible/roles/musl_native/tasks/main.yml b/ansible/roles/musl_native/tasks/main.yml
@@ -0,0 +1,26 @@
+---
+
+- name: "Clone musl"
+  git:
+    repo: 'git://git.musl-libc.org/musl'
+    dest: /usr/local/code/musl_native
+    version: v1.1.24
+
+- name: "Build"
+  shell: "{{ item }}"
+  args:
+    chdir: "/usr/local/code/musl_native"
+  with_items:
+    - "CC=/usr/bin/clang ./configure --prefix=/usr/local/faasm/musl_native"
+    - "make -j 10"
+
+- name: "Install"
+  become: yes
+  shell: "{{ item }}"
+  args:
+    chdir: "/usr/local/code/musl_native"
+  with_items:
+    - "mkdir -p /usr/local/faasm/musl_native"
+    - "sudo make install"
+
+
diff --git a/bin/build_llvm_perf.sh b/bin/build_llvm_perf.sh
@@ -1,30 +1,38 @@
 #!/bin/bash
 set -e
+set -x
 
-FAASM_DIR=${HOME}/faasm
-LLVM_DIR=${FAASM_DIR}/llvm
+CODE_DIR=/usr/local/code/
+if [[ ! -d ${CODE_DIR} ]]; then
+  echo "CODE_DIR (${CODE_DIR}) does not exists"
+  exit 1
+fi
+
+LLVM_DIR=${CODE_DIR}/llvm-perf
 BUILD_DIR=${LLVM_DIR}/build
 
-# Clone LLVM repo if not done already
-mkdir -p ${FAASM_DIR}
+# Clone LLVM-9 repo if not done already. Version must be compatible with WAVM
 if [[ ! -d "$LLVM_DIR" ]]; then
     git clone https://github.com/llvm/llvm-project ${LLVM_DIR}
     pushd ${LLVM_DIR}
     git checkout llvmorg-9.0.0-rc3
     popd
 fi
 
-# Clear build dir
-# rm -rf ${BUILD_DIR}
-
-# Make the build dir if it exists
+# Make the build dir
 mkdir -p ${BUILD_DIR}
-
 pushd ${BUILD_DIR}
 
+# Clear build dir
+if [[ -f "${BUILD_DIR}/build.ninja" ]]; then
+  ninja clean
+fi
+
 # Enable JIT profiling features of LLVM
 cmake \
 	-G Ninja \
+	-DCMAKE_C_COMPILER=/usr/bin/clang \
+	-DCMAKE_CXX_COMPILER=/usr/bin/clang++ \
 	-DCMAKE_BUILD_TYPE=RelWithDebInfo \
 	-DLLVM_USE_PERF=1 \
 	-DLLVM_USE_INTEL_JITEVENTS=1 \

diff --git a/docker/omp-musl.dockerfile b/docker/omp-musl.dockerfile
@@ -0,0 +1,13 @@
+FROM alpine
+
+RUN apk update && \
+    apk add g++ libgomp make
+
+# Compile omp programs
+COPY func/omp /build
+WORKDIR /build
+RUN make
+
+# clean up
+RUN rm -rf /build
+WORKDIR /
diff --git a/docs/perf-diff/explaination.md b/docs/perf-diff/explaination.md
@@ -0,0 +1,142 @@
+# Perf shows that the loops are doing the same
+
+## Log function
+
+Experiment code [here](../../func/demo/log_difference.cpp).
+
+### Perf stat
+
+#### Native
+
+```
+ Performance counter stats for 'speed_difference':
+
+      14530.607857      task-clock (msec)         #    1.000 CPUs utilized
+                 7      context-switches          #    0.000 K/sec
+                 0      cpu-migrations            #    0.000 K/sec
+               589      page-faults               #    0.041 K/sec
+    46,636,539,230      cycles                    #    3.210 GHz
+   114,771,451,699      instructions              #    2.46  insn per cycle
+    13,208,253,517      branches                  #  908.995 M/sec
+           196,535      branch-misses             #    0.00% of all branches
+
+      14.531197506 seconds time elapsed
+```
+
+#### Wasm
+
+```
+ Performance counter stats for 'simple_runner demo speed_difference':
+
+       4852.352536      task-clock (msec)         #    1.000 CPUs utilized
+                 3      context-switches          #    0.001 K/sec
+                 0      cpu-migrations            #    0.000 K/sec
+             5,611      page-faults               #    0.001 M/sec
+    14,531,719,357      cycles                    #    2.995 GHz
+    35,368,260,394      instructions              #    2.43  insn per cycle
+     4,120,953,595      branches                  #  849.269 M/sec
+         5,078,676      branch-misses             #    0.12% of all branches
+
+       4.852843140 seconds time elapsed
+```
+
+### Perf report
+
+Symbol table:
+```
+functionDef35       log
+functionDef2        __original_main
+```
+
+The global time spent in each section is very similar:
+![Time division](time-division.png)
+
+The loop bodies (pictures below) look quite similar -- unless the unknown/unknown:XX are something to worry about. The Wasm code 
+fiddles a lot with the MM registers before the loop. Manually compiling the native code with `clang++ -O3 -mavx2 -S` for comparison lead
+to the same code as shown below.
+
+![main native](main-native.png)
+
+![main Wasm](main-wasm.png)
+
+I think this is not a good experiment for general-purpose speed comparison between Wasm and Native because it seems that
+here the only difference is that MUSL's log is faster than GLibC's log. When looking at the `log` function ran in Wasm,
+it is a lot smaller than the GLibC one, which is the only explanation I have here for the speed difference.
+
+This experiment is however useful for understanding our second experiment because there is a log function taking 25% of
+the run time of the native Mersenne Twister engine experiment but there is no sign of `log` seems absent from the Wasm
+which however leads to the same result. I can only think the log was somehow optimised away?
+
+### Confirmation
+
+Compiling similar code at `-O3` natively but changing libc implementations gives the following:
+* glibc: 14.2
+* musl: 6.3
+
+Confirming our hypothesis that the musl `log` function is better.
+
+## Mersenne Twister Engine
+
+Experiment code [here](../../func/demo/mt_difference.cpp).
+
+### Perf stats
+
+#### Native
+
+
+```
+       9170.419967      task-clock (msec)         #    1.000 CPUs utilized
+                 5      context-switches          #    0.001 K/sec
+                 0      cpu-migrations            #    0.000 K/sec
+               588      page-faults               #    0.064 K/sec
+    27,822,779,162      cycles                    #    3.034 GHz
+    17,508,333,503      instructions              #    0.63  insn per cycle
+     2,560,064,376      branches                  #  279.165 M/sec
+         1,122,999      branch-misses             #    0.04% of all branches
+
+       9.171005174 seconds time elapsed
+
+```
+
+#### Wasm
+
+```
+
+
+       1317.060034      task-clock (msec)         #    1.000 CPUs utilized
+                 1      context-switches          #    0.001 K/sec
+                 0      cpu-migrations            #    0.000 K/sec
+             5,610      page-faults               #    0.004 M/sec
+     2,992,325,962      cycles                    #    2.272 GHz
+     6,657,716,630      instructions              #    2.22  insn per cycle
+       218,836,121      branches                  #  166.155 M/sec
+         5,013,145      branch-misses             #    2.29% of all branches
+
+       1.317535341 seconds time elapsed
+```
+
+
+###  Perf Report
+
+On the left, Wasm on the right, native. The symbol table is as follows. The
+fact we only have the `main` function in Wasm seem to indicate there is some
+inlining going on.
+```
+functionDef2        __original_main
+```
+
+![Perf report MT](mt-time.png)
+
+And indeed we can compare the loop bodies in the main function we the native simply
+calls the MT engine. And the Wasm code seem to be that same code but inlined (omitted
+because it was too long).
+
+The Native main loop uses `callq`, I'm not sure about perf's measures about the cost
+of instructions (`callq` < `mov` < `add`), and it seems clear that it doesn't include
+the time spent in `callq` because perf inverts the call graph.
+
+### Confirmation
+
+TODO confirm that:
+* Wasm code generation is more prone to inlining which helped in this loopy context
+* Wasm optimised the `logl` away, or else what happened to it?
diff --git a/docs/perf-diff/main-native.png b/docs/perf-diff/main-native.png
diff --git a/docs/perf-diff/main-wasm.png b/docs/perf-diff/main-wasm.png
diff --git a/docs/perf-diff/mt-time.png b/docs/perf-diff/mt-time.png
diff --git a/docs/perf-diff/time-division.png b/docs/perf-diff/time-division.png
diff --git a/func/demo/CMakeLists.txt b/func/demo/CMakeLists.txt
@@ -43,13 +43,15 @@ demo_func(increment increment.cpp)
 demo_func(isatty isatty.cpp)
 demo_func(listdir listdir.cpp)
 demo_func(lock lock.c)
+demo_func(log_difference log_difference.cpp)
 demo_func(malloc malloc.cpp)
 demo_func(matrix matrix.cpp)
 demo_func(memcpy memcpy.cpp)
 demo_func(memmove memmove.cpp)
 demo_func(mmap mmap.cpp)
 demo_func(mmap_big mmap_big.cpp)
 demo_func(mmap_file mmap_file.cpp)
+demo_func(mt_difference mt_difference.cpp)
 demo_func(new_obj new_obj.cpp)
 demo_func(noop noop.c)
 demo_func(optarg optarg.cpp)

diff --git a/func/demo/log_difference.cpp b/func/demo/log_difference.cpp
@@ -0,0 +1,16 @@
+#include <cstdio>
+#include <cstdint>
+
+const int32_t EXPECTED = -2147483529;
+
+int main() {
+    int32_t result = 100;
+    for (int32_t i = -200000000; i < 200000000; i++) {
+        result ^= ((int32_t) log(abs(i))) & ((int32_t) log(abs(i))) ;
+    }
+    if (result != EXPECTED) {
+        printf("Custom reduction failed. Expected %d but got %d\n", EXPECTED, result);
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/func/demo/mt_difference.cpp b/func/demo/mt_difference.cpp
@@ -0,0 +1,20 @@
+#include <cstdio>
+#include <cstdint>
+#include <random>
+
+const int32_t EXPECTED = 49999327;
+
+int main() {
+    std::uniform_real_distribution<double> unif(0, 1);
+    std::mt19937_64 generator(193734715);
+    int32_t result = 0;
+
+    for (int32_t i = 0; i < 100000000; i++) {
+        result += unif(generator) < 0.5 ? 1 : 0;
+    }
+    if (result != EXPECTED) {
+        printf("Custom reduction failed. Expected %d but got %d\n", EXPECTED, result);
+        return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
diff --git a/func/omp/Makefile b/func/omp/Makefile
@@ -0,0 +1,16 @@
+# Makefile for building native_musl targets in the omp-musl container
+
+CXX=g++
+CXX_FLAGS=-mavx2 -O3
+OMP_FLAGS=-fopenmp
+OUT_DIR=/usr/bin
+
+.PHONY : all
+
+all : mt_pi docker_benchmarker
+
+mt_pi: mt_pi.cpp
+	$(CXX) $(CXX_FLAGS) $(OMP_FLAGS) $< -o $(OUT_DIR)/$@
+
+docker_benchmarker: docker_benchmarker.cpp
+	$(CXX) $(CXX_FLAGS) $< -o $(OUT_DIR)/$@
diff --git a/func/omp/docker_benchmarker.cpp b/func/omp/docker_benchmarker.cpp
@@ -0,0 +1,69 @@
+#include <cstdio>
+#include <chrono>
+#include <fstream>
+#include <vector>
+#include <sys/stat.h>
+
+constexpr auto BINARY_DIR = "/usr/bin/";
+
+void
+nativeRun(std::ofstream &profOut, const std::string &execPath, long num_iterations, const std::string &iteration_name,
+          int num_threads);
+
+int main(int argc, char *argv[]) {
+
+    if (argc != 3) {
+        printf("Usage:\ndocker_benchmarker <function> <nNative>\n");
+        return 1;
+    }
+
+    std::string funcName = argv[1];
+    int nativeIterations = std::stoi(argv[2]);
+
+    std::string outfile = std::string("/wasm/omp/") + funcName + "/bench.csv";
+
+    std::ofstream profOut;
+    profOut.open(outfile);
+    profOut << "iterations,numThreads,type,microseconds" << std::endl;
+
+    std::string execPath = std::string(BINARY_DIR) + funcName;
+
+    std::vector<long> iterations = {200000l, 20000000l, 200000000l};
+    std::vector<std::string> iter_names = {"Tiny,", "Small,", "Big,"};
+
+    for (int run = 1; run <= nativeIterations; run++) {
+        printf("NATIVE - %s (%d/%d)\n", funcName.c_str(), run, nativeIterations);
+        for (size_t i = 0; i < iterations.size(); i++) {
+            nativeRun(profOut, execPath, iterations[i], iter_names[i], 1);
+            for (int num_threads = 2; num_threads < 25; num_threads += 2) {
+                nativeRun(profOut, execPath, iterations[i], iter_names[i], num_threads);
+            }
+        }
+    }
+
+    profOut.flush();
+    profOut.close();
+
+    chmod(outfile.c_str(),S_IRWXU | S_IRWXG | S_IRWXO);
+
+    return 0;
+}
+
+
+void
+nativeRun(std::ofstream &profOut, const std::string &execPath, long num_iterations, const std::string &iteration_name,
+          int num_threads) {
+
+    char buffer [128];
+    sprintf(buffer, "%s %d %ld", execPath.c_str(), num_threads, num_iterations);
+
+    std::chrono::steady_clock::time_point t1 = std::chrono::steady_clock::now();
+    int error = system(buffer);
+    const int nativeTime = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - t1).count();
+
+    if (error != 0) {
+        printf("Failed to execute %s", buffer);
+    }
+
+    profOut << iteration_name << num_threads << ",native," << nativeTime << std::endl;
+}