在本系列的前兩期中,我們考慮了 Perl 中浮動操作的性能,
Python 和 R 在一個玩具範例中計算函數 cos(sin(sqrt(x))),其中 x 是一個 非常大 50M 雙精確度浮點數數組。
將算術密集型部分委託給 C 的混合實作是效能最高的實作之一。在本期中,我們將稍微偏離主題,看看玩具範例的純 C 程式碼實現的效能。
C 程式碼將提供有關記憶體局部性對於效能重要性的進一步見解(預設情況下,C 陣列中的元素儲存在記憶體中的順序位址中,以及數字API,例如PDL 或numpy 與此類容器的介面)相對於容器,
例如Perl 數組不將其值儲存在記憶體中的連續位址中。最後但同樣重要的是,C 程式碼實作將允許我們評估與低階編譯器(在本例中為 gcc)的浮點運算相關的標誌是否會影響效能。
這一點值得強調:普通人在「管道」安裝或建置內聯檔時完全依賴編譯器標誌的選擇。如果一個人不觸及這些標誌,那麼人們就會幸福地不知道他們可能會錯過什麼,或者他們可能會避免的陷阱。
簡單的 C 檔案 makefile 允許人們明確地進行此類效能評估。
下面完整列出了我們玩具範例的 C 程式碼。程式碼是不言自明的,因此除了指出它包含
的四個函數之外,不會花時間解釋在這種情況下,人們可能希望編譯器足夠聰明,能夠識別平方根映射到彙編中的打包(向量化)浮點操作,以便可以使用適當的SIMD 指令對一個函數進行向量化(請注意,我們做了一個不使用OpenMP 程式碼的simd 程式)。
也許向量化帶來的加速可以抵消重複存取(或不存取)相同記憶體位置所造成的效能損失。
#include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #include <omp.h> // simulates a large array of random numbers double* simulate_array(int num_of_elements,int seed); // OMP environment functions void _set_openmp_schedule_from_env(); void _set_num_threads_from_env(); // functions to modify C arrays void map_c_array(double* array, int len); void map_c_array_sequential(double* array, int len); void map_C_array_using_OMP(double* array, int len); void map_C_array_sequential_using_OMP(double* array, int len); int main(int argc, char *argv[]) { if (argc != 2) { printf("Usage: %s <array_size>\n", argv[0]); return 1; } int array_size = atoi(argv[1]); // printf the array size printf("Array size: %d\n", array_size); double *array = simulate_array(array_size, 1234); // Set OMP environment _set_openmp_schedule_from_env(); _set_num_threads_from_env(); // Perform calculations and collect timing data double start_time, end_time, elapsed_time; // Non-Sequential calculation start_time = omp_get_wtime(); map_c_array(array, array_size); end_time = omp_get_wtime(); elapsed_time = end_time - start_time; printf("Non-sequential calculation time: %f seconds\n", elapsed_time); free(array); // Sequential calculation array = simulate_array(array_size, 1234); start_time = omp_get_wtime(); map_c_array_sequential(array, array_size); end_time = omp_get_wtime(); elapsed_time = end_time - start_time; printf("Sequential calculation time: %f seconds\n", elapsed_time); free(array); array = simulate_array(array_size, 1234); // Parallel calculation using OMP start_time = omp_get_wtime(); map_C_array_using_OMP(array, array_size); end_time = omp_get_wtime(); elapsed_time = end_time - start_time; printf("Parallel calculation using OMP time: %f seconds\n", elapsed_time); free(array); // Sequential calculation using OMP array = simulate_array(array_size, 1234); start_time = omp_get_wtime(); map_C_array_sequential_using_OMP(array, array_size); end_time = omp_get_wtime(); elapsed_time = end_time - start_time; printf("Sequential calculation using OMP time: %f seconds\n", elapsed_time); free(array); return 0; } /* ******************************************************************************* * OMP environment functions ******************************************************************************* */ void _set_openmp_schedule_from_env() { char *schedule_env = getenv("OMP_SCHEDULE"); printf("Schedule from env %s\n", getenv("OMP_SCHEDULE")); if (schedule_env != NULL) { char *kind_str = strtok(schedule_env, ","); char *chunk_size_str = strtok(NULL, ","); omp_sched_t kind; if (strcmp(kind_str, "static") == 0) { kind = omp_sched_static; } else if (strcmp(kind_str, "dynamic") == 0) { kind = omp_sched_dynamic; } else if (strcmp(kind_str, "guided") == 0) { kind = omp_sched_guided; } else { kind = omp_sched_auto; } int chunk_size = atoi(chunk_size_str); omp_set_schedule(kind, chunk_size); } } void _set_num_threads_from_env() { char *num = getenv("OMP_NUM_THREADS"); printf("Number of threads = %s from within C\n", num); omp_set_num_threads(atoi(num)); } /* ******************************************************************************* * Functions that modify C arrays whose address is passed from Perl in C ******************************************************************************* */ double* simulate_array(int num_of_elements, int seed) { srand(seed); // Seed the random number generator double *array = (double *)malloc(num_of_elements * sizeof(double)); for (int i = 0; i < num_of_elements; i++) { array[i] = (double)rand() / RAND_MAX; // Generate a random double between 0 and 1 } return array; } void map_c_array(double *array, int len) { for (int i = 0; i < len; i++) { array[i] = cos(sin(sqrt(array[i]))); } } void map_c_array_sequential(double* array, int len) { for (int i = 0; i < len; i++) { array[i] = sqrt(array[i]); } for (int i = 0; i < len; i++) { array[i] = sin(array[i]); } for (int i = 0; i < len; i++) { array[i] = cos(array[i]); } } void map_C_array_using_OMP(double* array, int len) { #pragma omp parallel { #pragma omp for schedule(runtime) nowait for (int i = 0; i < len; i++) { array[i] = cos(sin(sqrt(array[i]))); } } } void map_C_array_sequential_using_OMP(double* array, int len) { #pragma omp parallel { #pragma omp for schedule(runtime) nowait for (int i = 0; i < len; i++) { array[i] = sqrt(array[i]); } #pragma omp for schedule(runtime) nowait for (int i = 0; i < len; i++) { array[i] = sin(array[i]); } #pragma omp for schedule(runtime) nowait for (int i = 0; i < len; i++) { array[i] = cos(array[i]); } } }
一個關鍵問題是使用快速浮動編譯器標誌(一種以速度換取程式碼準確性的技巧)是否會影響效能。
這是沒有這個編譯器標誌的 makefile
CC = gcc CFLAGS = -O3 -ftree-vectorize -march=native -Wall -std=gnu11 -fopenmp -fstrict-aliasing LDFLAGS = -fPIE -fopenmp LIBS = -lm SOURCES = inplace_array_mod_with_OpenMP.c OBJECTS = $(SOURCES:.c=_noffmath_gcc.o) EXECUTABLE = inplace_array_mod_with_OpenMP_noffmath_gcc all: $(SOURCES) $(EXECUTABLE) clean: rm -f $(OBJECTS) $(EXECUTABLE) $(EXECUTABLE): $(OBJECTS) $(CC) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $@ %_noffmath_gcc.o : %.c $(CC) $(CFLAGS) -c $< -o $@
這是帶有此標誌的:
CC = gcc CFLAGS = -O3 -ftree-vectorize -march=native -Wall -std=gnu11 -fopenmp -fstrict-aliasing -ffast-math LDFLAGS = -fPIE -fopenmp LIBS = -lm SOURCES = inplace_array_mod_with_OpenMP.c OBJECTS = $(SOURCES:.c=_gcc.o) EXECUTABLE = inplace_array_mod_with_OpenMP_gcc all: $(SOURCES) $(EXECUTABLE) clean: rm -f $(OBJECTS) $(EXECUTABLE) $(EXECUTABLE): $(OBJECTS) $(CC) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $@ %_gcc.o : %.c $(CC) $(CFLAGS) -c $< -o $@
這是執行這兩個程式的結果
OMP_SCHEDULE=guided,1 OMP_NUM_THREADS=8 ./inplace_array_mod_with_OpenMP_noffmath_gcc 50000000 Array size: 50000000 Schedule from env guided,1 Number of threads = 8 from within C Non-sequential calculation time: 1.12 seconds Sequential calculation time: 0.95 seconds Parallel calculation using OMP time: 0.17 seconds Sequential calculation using OMP time: 0.15 seconds
OMP_SCHEDULE=guided,1 OMP_NUM_THREADS=8 ./inplace_array_mod_with_OpenMP_gcc 50000000 Array size: 50000000 Schedule from env guided,1 Number of threads = 8 from within C Non-sequential calculation time: 0.27 seconds Sequential calculation time: 0.28 seconds Parallel calculation using OMP time: 0.05 seconds Sequential calculation using OMP time: 0.06 seconds
請注意,可以在 Numba 程式碼中使用 fastmath,如下所示(預設為 fastmath=False):
@njit(nogil=True,fastmath=True) def compute_inplace_with_numba(array): np.sqrt(array,array) np.sin(array,array) np.cos(array,array)
值得注意的幾點:
標題:「效能追求第三部分:C Force」
在本系列的前两期中,我们考虑了 Perl 中浮动操作的性能,
Python 和 R 在一个玩具示例中计算函数 cos(sin(sqrt(x))),其中 x 是一个 非常大 50M 双精度浮点数数组。
将算术密集型部分委托给 C 的混合实现是性能最高的实现之一。在本期中,我们将稍微偏离主题,看看玩具示例的纯 C 代码实现的性能。
C 代码将提供有关内存局部性对于性能重要性的进一步见解(默认情况下,C 数组中的元素存储在内存中的顺序地址中,以及数字 API,例如 PDL 或 numpy 与此类容器的接口)相对于容器,
例如Perl 数组不将其值存储在内存中的连续地址中。最后但同样重要的是,C 代码实现将允许我们评估与低级编译器(在本例中为 gcc)的浮点运算相关的标志是否会影响性能。
这一点值得强调:普通人在“管道”安装或构建内联文件时完全依赖于编译器标志的选择。如果一个人不触及这些标志,那么人们就会幸福地不知道他们可能会错过什么,或者他们可能会避免的陷阱。
简陋的 C 文件 makefile 允许人们明确地进行此类性能评估。
下面完整列出了我们玩具示例的 C 代码。该代码相当不言自明,因此除了指出它包含
的四个函数之外,不会花时间解释在这种情况下,人们可能希望编译器足够聪明,能够识别平方根映射到汇编中的打包(矢量化)浮点操作,以便可以使用适当的 SIMD 指令对一个函数进行矢量化(请注意,我们做了不使用 OpenMP 代码的 simd 程序)。
也许矢量化带来的加速可以抵消重复访问(或不访问)相同内存位置所造成的性能损失。
#include <stdlib.h> #include <string.h> #include <math.h> #include <stdio.h> #include <omp.h> // simulates a large array of random numbers double* simulate_array(int num_of_elements,int seed); // OMP environment functions void _set_openmp_schedule_from_env(); void _set_num_threads_from_env(); // functions to modify C arrays void map_c_array(double* array, int len); void map_c_array_sequential(double* array, int len); void map_C_array_using_OMP(double* array, int len); void map_C_array_sequential_using_OMP(double* array, int len); int main(int argc, char *argv[]) { if (argc != 2) { printf("Usage: %s <array_size>\n", argv[0]); return 1; } int array_size = atoi(argv[1]); // printf the array size printf("Array size: %d\n", array_size); double *array = simulate_array(array_size, 1234); // Set OMP environment _set_openmp_schedule_from_env(); _set_num_threads_from_env(); // Perform calculations and collect timing data double start_time, end_time, elapsed_time; // Non-Sequential calculation start_time = omp_get_wtime(); map_c_array(array, array_size); end_time = omp_get_wtime(); elapsed_time = end_time - start_time; printf("Non-sequential calculation time: %f seconds\n", elapsed_time); free(array); // Sequential calculation array = simulate_array(array_size, 1234); start_time = omp_get_wtime(); map_c_array_sequential(array, array_size); end_time = omp_get_wtime(); elapsed_time = end_time - start_time; printf("Sequential calculation time: %f seconds\n", elapsed_time); free(array); array = simulate_array(array_size, 1234); // Parallel calculation using OMP start_time = omp_get_wtime(); map_C_array_using_OMP(array, array_size); end_time = omp_get_wtime(); elapsed_time = end_time - start_time; printf("Parallel calculation using OMP time: %f seconds\n", elapsed_time); free(array); // Sequential calculation using OMP array = simulate_array(array_size, 1234); start_time = omp_get_wtime(); map_C_array_sequential_using_OMP(array, array_size); end_time = omp_get_wtime(); elapsed_time = end_time - start_time; printf("Sequential calculation using OMP time: %f seconds\n", elapsed_time); free(array); return 0; } /* ******************************************************************************* * OMP environment functions ******************************************************************************* */ void _set_openmp_schedule_from_env() { char *schedule_env = getenv("OMP_SCHEDULE"); printf("Schedule from env %s\n", getenv("OMP_SCHEDULE")); if (schedule_env != NULL) { char *kind_str = strtok(schedule_env, ","); char *chunk_size_str = strtok(NULL, ","); omp_sched_t kind; if (strcmp(kind_str, "static") == 0) { kind = omp_sched_static; } else if (strcmp(kind_str, "dynamic") == 0) { kind = omp_sched_dynamic; } else if (strcmp(kind_str, "guided") == 0) { kind = omp_sched_guided; } else { kind = omp_sched_auto; } int chunk_size = atoi(chunk_size_str); omp_set_schedule(kind, chunk_size); } } void _set_num_threads_from_env() { char *num = getenv("OMP_NUM_THREADS"); printf("Number of threads = %s from within C\n", num); omp_set_num_threads(atoi(num)); } /* ******************************************************************************* * Functions that modify C arrays whose address is passed from Perl in C ******************************************************************************* */ double* simulate_array(int num_of_elements, int seed) { srand(seed); // Seed the random number generator double *array = (double *)malloc(num_of_elements * sizeof(double)); for (int i = 0; i < num_of_elements; i++) { array[i] = (double)rand() / RAND_MAX; // Generate a random double between 0 and 1 } return array; } void map_c_array(double *array, int len) { for (int i = 0; i < len; i++) { array[i] = cos(sin(sqrt(array[i]))); } } void map_c_array_sequential(double* array, int len) { for (int i = 0; i < len; i++) { array[i] = sqrt(array[i]); } for (int i = 0; i < len; i++) { array[i] = sin(array[i]); } for (int i = 0; i < len; i++) { array[i] = cos(array[i]); } } void map_C_array_using_OMP(double* array, int len) { #pragma omp parallel { #pragma omp for schedule(runtime) nowait for (int i = 0; i < len; i++) { array[i] = cos(sin(sqrt(array[i]))); } } } void map_C_array_sequential_using_OMP(double* array, int len) { #pragma omp parallel { #pragma omp for schedule(runtime) nowait for (int i = 0; i < len; i++) { array[i] = sqrt(array[i]); } #pragma omp for schedule(runtime) nowait for (int i = 0; i < len; i++) { array[i] = sin(array[i]); } #pragma omp for schedule(runtime) nowait for (int i = 0; i < len; i++) { array[i] = cos(array[i]); } } }
一个关键问题是使用快速浮动编译器标志(一种以速度换取代码准确性的技巧)是否会影响性能。
这是没有这个编译器标志的 makefile
CC = gcc CFLAGS = -O3 -ftree-vectorize -march=native -Wall -std=gnu11 -fopenmp -fstrict-aliasing LDFLAGS = -fPIE -fopenmp LIBS = -lm SOURCES = inplace_array_mod_with_OpenMP.c OBJECTS = $(SOURCES:.c=_noffmath_gcc.o) EXECUTABLE = inplace_array_mod_with_OpenMP_noffmath_gcc all: $(SOURCES) $(EXECUTABLE) clean: rm -f $(OBJECTS) $(EXECUTABLE) $(EXECUTABLE): $(OBJECTS) $(CC) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $@ %_noffmath_gcc.o : %.c $(CC) $(CFLAGS) -c $< -o $@
这是带有此标志的:
CC = gcc CFLAGS = -O3 -ftree-vectorize -march=native -Wall -std=gnu11 -fopenmp -fstrict-aliasing -ffast-math LDFLAGS = -fPIE -fopenmp LIBS = -lm SOURCES = inplace_array_mod_with_OpenMP.c OBJECTS = $(SOURCES:.c=_gcc.o) EXECUTABLE = inplace_array_mod_with_OpenMP_gcc all: $(SOURCES) $(EXECUTABLE) clean: rm -f $(OBJECTS) $(EXECUTABLE) $(EXECUTABLE): $(OBJECTS) $(CC) $(LDFLAGS) $(OBJECTS) $(LIBS) -o $@ %_gcc.o : %.c $(CC) $(CFLAGS) -c $< -o $@
这是运行这两个程序的结果
OMP_SCHEDULE=guided,1 OMP_NUM_THREADS=8 ./inplace_array_mod_with_OpenMP_noffmath_gcc 50000000 Array size: 50000000 Schedule from env guided,1 Number of threads = 8 from within C Non-sequential calculation time: 1.12 seconds Sequential calculation time: 0.95 seconds Parallel calculation using OMP time: 0.17 seconds Sequential calculation using OMP time: 0.15 seconds
OMP_SCHEDULE=guided,1 OMP_NUM_THREADS=8 ./inplace_array_mod_with_OpenMP_gcc 50000000 Array size: 50000000 Schedule from env guided,1 Number of threads = 8 from within C Non-sequential calculation time: 0.27 seconds Sequential calculation time: 0.28 seconds Parallel calculation using OMP time: 0.05 seconds Sequential calculation using OMP time: 0.06 seconds
请注意,可以在 Numba 代码中使用 fastmath,如下所示(默认为 fastmath=False):
@njit(nogil=True,fastmath=True) def compute_inplace_with_numba(array): np.sqrt(array,array) np.sin(array,array) np.cos(array,array)
值得注意的几点:
以上是追求性能第三部分:C Force的詳細內容。更多資訊請關注PHP中文網其他相關文章!