Bagaimana untuk mencapai maksimum teori 4 FLOP setiap kitaran?
Secara teori, prestasi puncak 4 operasi titik terapung (kepersisan berganda) setiap kitaran boleh dicapai pada CPU Intel x86-64 moden dengan menggunakan teknik berikut:
Mengoptimumkan kod untuk arahan SSE
Membuka gulungan gelung (loop unrolling) dan menyelang-nyelikan arahan (interleaving)
Mengelompokkan operasi secara bertiga
Mengelakkan sekatan saluran paip (pipeline stall) dan kebergantungan yang tidak perlu
Kod contoh
Coretan kod berikut menunjukkan cara untuk mencapai prestasi yang menghampiri puncak pada CPU Intel Core i5 dan Core i7:
#include <emmintrin.h> #include <omp.h> #include <iostream> using namespace std; typedef unsigned long long uint64; double test_dp_mac_SSE(double x, double y, uint64 iterations) { register __m128d r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, rA, rB, rC, rD, rE, rF; // Generate starting data. r0 = _mm_set1_pd(x); r1 = _mm_set1_pd(y); r8 = _mm_set1_pd(-0.0); r2 = _mm_xor_pd(r0, r8); r3 = _mm_or_pd(r0, r8); r4 = _mm_andnot_pd(r8, r0); r5 = _mm_mul_pd(r1, _mm_set1_pd(0.37796447300922722721)); r6 = _mm_mul_pd(r1, _mm_set1_pd(0.24253562503633297352)); r7 = _mm_mul_pd(r1, _mm_set1_pd(4.1231056256176605498)); r8 = _mm_add_pd(r0, _mm_set1_pd(0.37796447300922722721)); r9 = _mm_add_pd(r1, _mm_set1_pd(0.24253562503633297352)); rA = _mm_sub_pd(r0, _mm_set1_pd(4.1231056256176605498)); rB = _mm_sub_pd(r1, _mm_set1_pd(4.1231056256176605498)); rC = _mm_set1_pd(1.4142135623730950488); rD = _mm_set1_pd(1.7320508075688772935); rE = _mm_set1_pd(0.57735026918962576451); rF = _mm_set1_pd(0.70710678118654752440); uint64 iMASK = 0x800fffffffffffffull; __m128d MASK = _mm_set1_pd(*(double*)&iMASK); __m128d vONE = _mm_set1_pd(1.0); uint64 c = 0; while (c < iterations) { size_t i = 0; while (i < 1000) { // Main computational loop r0 = _mm_mul_pd(r0, rC); r1 = _mm_add_pd(r1, rD); r2 = _mm_mul_pd(r2, rE); r3 = _mm_sub_pd(r3, rF); r4 = _mm_mul_pd(r4, rC); r5 = _mm_add_pd(r5, rD); r6 = _mm_mul_pd(r6, rE); r7 = _mm_sub_pd(r7, rF); r8 = _mm_mul_pd(r8, rC); r9 = _mm_add_pd(r9, rD); rA = _mm_mul_pd(rA, rE); rB = _mm_sub_pd(rB, rF); r0 = _mm_add_pd(r0, rF); r1 = _mm_mul_pd(r1, rE); r2 = _mm_sub_pd(r2, rD); r3 = _mm_mul_pd(r3, rC); r4 = _mm_add_pd(r4, rF); r5 = _mm_mul_pd(r5, rE); r6 = _mm_sub_pd(r6, rD); r7 = _mm_mul_pd(r7, rC); r8 = _mm_add_pd(r8, rF); r9 = _mm_mul_pd(r9, rE); rA = _mm_sub_pd(rA, rD); rB = _mm_mul_pd(rB, rC); r0 = _mm_mul_pd(r0, rC); r1 = _mm_add_pd(r1, rD); r2 = _mm_mul_pd(r2, rE); r3 = _mm_sub_pd(r3, rF); r4 = _mm_mul_pd(r4, rC); r5 = _mm_add_pd(r5, rD); r6 = _mm_mul_pd(r6, rE); r7 = 
_mm_sub_pd(r7, rF); r8 = _mm_mul_pd(r8, rC); r9 = _mm_add_pd(r9, rD);
Di atas ialah kandungan terperinci tentang "Bagaimana untuk Mencapai 4 FLOP Setiap Kitaran pada CPU Intel x86-64 Moden?". Untuk maklumat lanjut, sila ikuti artikel berkaitan yang lain di laman web China PHP!