integer copy: A(i) -> B(i) icopy: 1 134217728 1.88758 71.1058MFlop/s icopy: 2 67108864 1.61759 82.9738MFlop/s icopy: 4 33554432 1.21315 110.635MFlop/s icopy: 8 16777216 0.943617 142.238MFlop/s icopy: 16 8388608 0.724883 185.158MFlop/s icopy: 32 4194304 0.632276 212.277MFlop/s icopy: 64 2097152 0.585846 229.101MFlop/s icopy: 128 1048576 0.613241 218.866MFlop/s icopy: 256 524288 0.575591 233.182MFlop/s icopy: 512 262144 0.562549 238.589MFlop/s icopy: 1024 131072 0.557041 240.948MFlop/s icopy: 2048 65536 0.553881 242.322MFlop/s icopy: 4096 32768 0.859391 156.178MFlop/s icopy: 8192 16384 0.923778 145.292MFlop/s icopy: 16384 8192 0.908659 147.71MFlop/s icopy: 32768 4096 0.866429 154.909MFlop/s icopy: 65536 2048 0.832227 161.275MFlop/s icopy: 131072 1024 0.917907 146.221MFlop/s icopy: 262144 512 0.940208 142.753MFlop/s icopy: 524288 256 0.938848 142.96MFlop/s icopy: 1048576 128 0.944368 142.124MFlop/s integer axpy: A(i)*Alpha + B(i) -> B(i) iaxpy: 1 33554432 0.714486 93.9261MFlop/s iaxpy: 2 16777216 0.606717 110.61MFlop/s iaxpy: 4 8388608 0.488635 137.339MFlop/s iaxpy: 8 4194304 0.387773 173.062MFlop/s iaxpy: 16 2097152 0.303389 221.197MFlop/s iaxpy: 32 1048576 0.259165 258.943MFlop/s iaxpy: 64 524288 0.235088 285.463MFlop/s iaxpy: 128 262144 0.230798 290.769MFlop/s iaxpy: 256 131072 0.220794 303.943MFlop/s iaxpy: 512 65536 0.215769 311.022MFlop/s iaxpy: 1024 32768 0.213402 314.472MFlop/s iaxpy: 2048 16384 0.216926 309.363MFlop/s iaxpy: 4096 8192 0.313553 214.027MFlop/s iaxpy: 8192 4096 0.316371 212.121MFlop/s iaxpy: 16384 2048 0.318113 210.959MFlop/s iaxpy: 32768 1024 0.318728 210.552MFlop/s iaxpy: 65536 512 0.321135 208.974MFlop/s iaxpy: 131072 256 0.337958 198.572MFlop/s iaxpy: 262144 128 0.337717 198.713MFlop/s iaxpy: 524288 64 0.337522 198.828MFlop/s iaxpy: 1048576 32 0.337766 198.684MFlop/s integer gemm: Alpha*A*B + Beta*C -> C igemm: 1 16777217 0.977596 34.3234MFlop/s igemm: 2 2097153 0.423417 79.2468MFlop/s igemm: 4 262145 0.281659 119.132MFlop/s igemm: 8 32769 0.180372 186.035MFlop/s igemm: 16 4097 0.121527 276.174MFlop/s igemm: 32 513 0.095811 350.899MFlop/s igemm: 64 65 0.089958 378.829MFlop/s igemm: 128 9 0.102865 366.974MFlop/s igemm: 256 2 0.168485 398.308MFlop/s igemm: 512 1 0.724611 370.455MFlop/s igemm: 1024 1 5.56539 385.864MFlop/s float axpy: A(i)*Alpha + B(i) -> B(i) faxpy: 1 33554432 0.612352 109.592MFlop/s faxpy: 2 16777216 0.606675 110.617MFlop/s faxpy: 4 8388608 0.606576 110.636MFlop/s faxpy: 8 4194304 0.640333 104.803MFlop/s faxpy: 16 2097152 0.619254 108.37MFlop/s faxpy: 32 1048576 0.612921 109.49MFlop/s faxpy: 64 524288 0.609989 110.017MFlop/s faxpy: 128 262144 0.616826 108.797MFlop/s faxpy: 256 131072 0.611411 109.761MFlop/s faxpy: 512 65536 0.609064 110.184MFlop/s faxpy: 1024 32768 0.607799 110.413MFlop/s faxpy: 2048 16384 0.60774 110.424MFlop/s faxpy: 4096 8192 0.615278 109.071MFlop/s faxpy: 8192 4096 0.617283 108.717MFlop/s faxpy: 16384 2048 0.62105 108.057MFlop/s faxpy: 32768 1024 0.621796 107.927MFlop/s faxpy: 65536 512 0.637908 105.201MFlop/s faxpy: 131072 256 0.634582 105.753MFlop/s faxpy: 262144 128 0.633291 105.968MFlop/s faxpy: 524288 64 0.632839 106.044MFlop/s faxpy: 1048576 32 0.632434 106.112MFlop/s float gemm: Alpha*A*B + Beta*C -> C fgemm: 1 16777217 0.825751 40.6351MFlop/s fgemm: 2 2097153 0.375941 89.2546MFlop/s fgemm: 4 262145 0.27169 123.503MFlop/s fgemm: 8 32769 0.188021 178.467MFlop/s fgemm: 16 4097 0.141637 236.962MFlop/s fgemm: 32 513 0.121051 277.734MFlop/s fgemm: 64 65 0.116432 292.692MFlop/s fgemm: 128 9 0.134333 281.009MFlop/s fgemm: 256 2 0.227608 294.844MFlop/s fgemm: 512 1 0.97239 276.057MFlop/s fgemm: 1024 1 7.4889 286.756MFlop/s double axpy: A(i)*Alpha + B(i) -> B(i) daxpy: 1 33554432 0.613503 109.386MFlop/s daxpy: 2 16777216 0.606637 110.624MFlop/s daxpy: 4 8388608 0.606629 110.626MFlop/s daxpy: 8 4194304 0.640303 104.808MFlop/s daxpy: 16 2097152 0.619224 108.376MFlop/s daxpy: 32 1048576 0.612891 109.496MFlop/s daxpy: 64 524288 0.609753 110.059MFlop/s daxpy: 128 262144 0.618436 108.514MFlop/s daxpy: 256 131072 0.611357 109.77MFlop/s daxpy: 512 65536 0.609016 110.192MFlop/s daxpy: 1024 32768 0.608714 110.247MFlop/s daxpy: 2048 16384 0.636687 105.403MFlop/s daxpy: 4096 8192 0.639517 104.937MFlop/s daxpy: 8192 4096 0.64245 104.458MFlop/s daxpy: 16384 2048 0.642502 104.449MFlop/s daxpy: 32768 1024 0.653713 102.658MFlop/s daxpy: 65536 512 0.675695 99.3183MFlop/s daxpy: 131072 256 0.675219 99.3883MFlop/s daxpy: 262144 128 0.675139 99.4001MFlop/s daxpy: 524288 64 0.675176 99.3946MFlop/s daxpy: 1048576 32 0.675118 99.4032MFlop/s double gemm: Alpha*A*B + Beta*C -> C dgemm: 1 16777217 0.859402 39.0439MFlop/s dgemm: 2 2097153 0.374929 89.4955MFlop/s dgemm: 4 262145 0.270685 123.962MFlop/s dgemm: 8 32769 0.187946 178.538MFlop/s dgemm: 16 4097 0.141765 236.748MFlop/s dgemm: 32 513 0.12493 269.11MFlop/s dgemm: 64 65 0.127269 267.769MFlop/s dgemm: 128 9 0.143386 263.267MFlop/s dgemm: 256 2 0.252687 265.581MFlop/s dgemm: 512 1 1.02391 262.167MFlop/s dgemm: 1024 1 8.11441 264.65MFlop/s optimized float gemm: Alpha*A*B + Beta*C -> C sgemm: 8 32769 0.127395 263.397MFlop/s sgemm: 16 4097 0.0524575 639.806MFlop/s sgemm: 32 513 0.0312485 1075.89MFlop/s sgemm: 64 65 0.0247532 1376.74MFlop/s sgemm: 128 9 0.0136254 2770.47MFlop/s sgemm: 256 2 0.0209692 3200.35MFlop/s sgemm: 512 1 0.0768407 3493.4MFlop/s sgemm: 1024 1 0.593705 3617.09MFlop/s sgemm: 2048 1 4.73977 3624.62MFlop/s optimized double gemm: Alpha*A*B + Beta*C -> C dgemm: 8 32769 0.208578 160.877MFlop/s dgemm: 16 4097 0.114323 293.577MFlop/s dgemm: 32 513 0.091746 366.446MFlop/s dgemm: 64 65 0.086877 392.264MFlop/s dgemm: 128 9 0.046618 809.746MFlop/s dgemm: 256 2 0.080234 836.414MFlop/s dgemm: 512 1 0.318708 842.261MFlop/s dgemm: 1024 1 2.50245 858.153MFlop/s dgemm: 2048 1 19.9557 860.901MFlop/s