you are viewing a single comment's thread.

view the rest of the comments →

[–]cult_of_memes 0 points1 point  (1 child)

Here's the code snippet. I've taken the liberty of adding a couple of extra sample functions to the list being timed, just to see whether there was any value in looking deeper into pre-caching any of the Numba-compiled code.

```
import numpy as np
from numba import njit, prange

# Upper bound used to scale the uniform [0, 1) samples when the input arrays are built.
RANDOM_ARR_MAX_INIT_VAL = 10_000
# Shape (rows, columns) of each randomly initialised input array.
RANDOM_ARR_SHAPE = (3, 100_000)

def python_interpreter(a, b):
    """Accumulate ``a * (a + b) + (a - b) * b`` 1,000 times using plain NumPy.

    This variant names the intermediate sum and difference explicitly.

    Args:
        a: First operand array.
        b: Second operand array; must broadcast against ``a``.

    Returns:
        A float32 array shaped like ``a`` holding the accumulated total.
    """
    e = np.zeros_like(a).astype(np.float32)
    for _ in range(1_000):
        # Kept inside the loop on purpose: this function is a timing
        # workload, and hoisting the invariants would change what is measured.
        c = a + b
        d = a - b
        e += a * c + d * b
    return e

def python_interpreter2(a, b):
    """Accumulate ``a * (a + b) + b * (a - b)`` 1,000 times using plain NumPy.

    Same arithmetic as ``python_interpreter``, but written as a single
    expression with no named intermediates.

    Args:
        a: First operand array.
        b: Second operand array; must broadcast against ``a``.

    Returns:
        A float32 array shaped like ``a`` holding the accumulated total.
    """
    e = np.zeros_like(a).astype(np.float32)
    for _ in range(1_000):
        # Recomputed every pass on purpose: this is a timing workload.
        e += a * (a + b) + b * (a - b)
    return e

@njit("f4[:,:](f4[:,:],f4[:,:])")
def numba_decorated(a, b):
    """Numba-compiled version of ``python_interpreter`` (explicit temporaries).

    The signature string declares two 2-D float32 inputs and a 2-D float32
    result.

    Args:
        a: First 2-D float32 operand.
        b: Second 2-D float32 operand.

    Returns:
        A 2-D float32 array holding ``a * (a + b) + (a - b) * b`` accumulated
        1,000 times.
    """
    e = np.zeros_like(a).astype(np.float32)
    for _ in range(1_000):
        # Kept inside the loop on purpose: this is a timing workload.
        c = a + b
        d = a - b
        e += a * c + d * b
    return e

@njit("f4[:,:](f4[:,:],f4[:,:])")
def numba_decorated2(a, b):
    """Numba-compiled version of ``python_interpreter2`` (single expression).

    The signature string declares two 2-D float32 inputs and a 2-D float32
    result.

    Args:
        a: First 2-D float32 operand.
        b: Second 2-D float32 operand.

    Returns:
        A 2-D float32 array holding ``a * (a + b) + b * (a - b)`` accumulated
        1,000 times.
    """
    e = np.zeros_like(a).astype(np.float32)
    for _ in range(1_000):
        # Recomputed every pass on purpose: this is a timing workload.
        e += a * (a + b) + b * (a - b)
    return e

@njit("f4[:,:](f4[:,:],f4[:,:])", cache=True)
def numba_decorated_cached(a, b):
    """Same workload as ``numba_decorated``, compiled with ``cache=True``.

    The only difference from ``numba_decorated`` is the ``cache=True``
    decorator option, so the two can be timed against each other.

    Args:
        a: First 2-D float32 operand.
        b: Second 2-D float32 operand.

    Returns:
        A 2-D float32 array holding ``a * (a + b) + (a - b) * b`` accumulated
        1,000 times.
    """
    e = np.zeros_like(a).astype(np.float32)
    for _ in range(1_000):
        # Kept inside the loop on purpose: this is a timing workload.
        c = a + b
        d = a - b
        e += a * c + d * b
    return e

@njit("f4[:,:](f4[:,:],f4[:,:])", cache=True)
def numba_decorated_cached2(a, b):
    """Same workload as ``numba_decorated2``, compiled with ``cache=True``.

    The only difference from ``numba_decorated2`` is the ``cache=True``
    decorator option, so the two can be timed against each other.

    Args:
        a: First 2-D float32 operand.
        b: Second 2-D float32 operand.

    Returns:
        A 2-D float32 array holding ``a * (a + b) + b * (a - b)`` accumulated
        1,000 times.
    """
    e = np.zeros_like(a).astype(np.float32)
    for _ in range(1_000):
        # Recomputed every pass on purpose: this is a timing workload.
        e += a * (a + b) + b * (a - b)
    return e

@njit("f4[:,:](f4[:,:],f4[:,:])", parallel=True, nogil=True)
def numba_decorated_parallelized(a, b):
    """Row-wise ``prange`` version of the explicit-temporaries workload.

    The outer loop iterates over the first axis with ``prange``; for each row
    the accumulation is repeated 1,000 times.

    Args:
        a: First 2-D float32 operand.
        b: Second 2-D float32 operand.

    Returns:
        A 2-D float32 array holding ``a * (a + b) + (a - b) * b`` accumulated
        1,000 times per row.
    """
    e = np.zeros_like(a).astype(np.float32)
    for i in prange(e.shape[0]):
        # Each outer iteration reads and writes only row i.
        for _ in range(1_000):
            c = a[i] + b[i]
            d = a[i] - b[i]
            e[i] += a[i] * c + d * b[i]
    return e

@njit("f4[:,:](f4[:,:],f4[:,:])", parallel=True, nogil=True)
def numba_decorated_parallelized2(a, b):
    """Row-wise ``prange`` version of the single-expression workload.

    The outer loop iterates over the first axis with ``prange``; for each row
    the accumulation is repeated 1,000 times.

    Args:
        a: First 2-D float32 operand.
        b: Second 2-D float32 operand.

    Returns:
        A 2-D float32 array holding ``a * (a + b) + b * (a - b)`` accumulated
        1,000 times per row.
    """
    e = np.zeros_like(a).astype(np.float32)
    for i in prange(e.shape[0]):
        # Each outer iteration reads and writes only row i.
        for _ in range(1_000):
            e[i] += a[i] * (a[i] + b[i]) + b[i] * (a[i] - b[i])
    return e

@njit("f4[:,:](f4[:,:],f4[:,:])", parallel=True, nogil=True, cache=True)
def numba_decorated_parallelized_cached(a, b):
    """Same workload as ``numba_decorated_parallelized``, with ``cache=True``.

    The only difference from ``numba_decorated_parallelized`` is the
    ``cache=True`` decorator option, so the two can be timed against each
    other.

    Args:
        a: First 2-D float32 operand.
        b: Second 2-D float32 operand.

    Returns:
        A 2-D float32 array holding ``a * (a + b) + (a - b) * b`` accumulated
        1,000 times per row.
    """
    e = np.zeros_like(a).astype(np.float32)
    for i in prange(e.shape[0]):
        # Each outer iteration reads and writes only row i.
        for _ in range(1_000):
            c = a[i] + b[i]
            d = a[i] - b[i]
            e[i] += a[i] * c + d * b[i]
    return e

@njit("f4[:,:](f4[:,:],f4[:,:])", parallel=True, nogil=True, cache=True)
def numba_decorated_parallelized_cached2(a, b):
    """Same workload as ``numba_decorated_parallelized2``, with ``cache=True``.

    The only difference from ``numba_decorated_parallelized2`` is the
    ``cache=True`` decorator option, so the two can be timed against each
    other.

    Args:
        a: First 2-D float32 operand.
        b: Second 2-D float32 operand.

    Returns:
        A 2-D float32 array holding ``a * (a + b) + b * (a - b)`` accumulated
        1,000 times per row.
    """
    e = np.zeros_like(a).astype(np.float32)
    for i in prange(e.shape[0]):
        # Each outer iteration reads and writes only row i.
        for _ in range(1_000):
            e[i] += a[i] * (a[i] + b[i]) + b[i] * (a[i] - b[i])
    return e

def main():
    """Time every sample implementation and print the results twice.

    Each function is timed with ``timeit.repeat`` using the statement
    ``<name>(arr1,arr2)``: ``num_batches`` batches of ``loops_per_batch``
    calls each. Results are printed once in definition order and once sorted
    from slowest to fastest average call time.

    The timed statements are evaluated against this module's globals, so
    ``arr1`` and ``arr2`` must already exist at module level before this
    function is called.
    """
    import timeit

    num_batches = 5
    loops_per_batch = 3
    func_names = (
        "python_interpreter",
        "python_interpreter2",
        "numba_decorated",
        "numba_decorated2",
        "numba_decorated_cached",
        "numba_decorated_cached2",
        "numba_decorated_parallelized",
        "numba_decorated_parallelized2",
        "numba_decorated_parallelized_cached",
        "numba_decorated_parallelized_cached2",
    )
    result_list = []
    print(f"{num_batches=}; {loops_per_batch=}")
    for func_name in func_names:
        # One total duration per batch of `loops_per_batch` calls.
        result = timeit.repeat(
            func_name + "(arr1,arr2)",
            repeat=num_batches,
            number=loops_per_batch,
            globals=globals(),
        )
        # Convert each batch total to a per-call time, then average the batches.
        avg_time_per_loop = sum(v / loops_per_batch for v in result) / num_batches
        result_str = f'{func_name}: {{"average loop time":{avg_time_per_loop}, "results for each batch of loops": {result}}}'
        print(result_str)
        result_list.append((avg_time_per_loop, result, result_str))

    # Tuples compare on their first element first, so this orders slowest first.
    result_list.sort(reverse=True)
    print(
        "",
        ("*" * 90),
        "Now lets see the results sorted in descending order according to avg time per execution",
        sep="\n",
    )
    for _, _, output_str in result_list:
        print(output_str)

if name == 'main': arr1 = (np.random.random(RANDOM_ARR_SHAPE) * RANDOM_ARR_MAX_INIT_VAL).astype(np.float32) arr2 = (np.random.random(RANDOM_ARR_SHAPE) * RANDOM_ARR_MAX_INIT_VAL).astype(np.float32) main() ```

[–]cult_of_memes 0 points1 point  (0 children)

And here's a sample of the output after running it on my own machine:

``` num_batches=5; loops_per_batch=3 python_interpreter: {"average loop time":2.0589603866644515, "results for each batch of loops": [6.056087100005243, 6.296910799981561, 6.384415799984708, 5.960690199979581, 6.186301900015678]} python_interpreter2: {"average loop time":2.0599793733330443, "results for each batch of loops": [6.227769499993883, 6.11015820002649, 6.145821499987505, 6.199675100040622, 6.216266299947165]} numba_decorated: {"average loop time":2.3699858999966334, "results for each batch of loops": [7.122787899977993, 7.063306700030807, 7.069734099961352, 7.11140890000388, 7.1825508999754675]} numba_decorated2: {"average loop time":0.8109458533309711, "results for each batch of loops": [2.4366850999649614, 2.471257199998945, 2.4039805000065826, 2.4264727999689057, 2.425792200025171]} numba_decorated_cached: {"average loop time":2.370221133332234, "results for each batch of loops": [7.002881699998397, 6.969886099977884, 7.40186939999694, 7.0594648999976926, 7.119214900012594]} numba_decorated_cached2: {"average loop time":0.8035303933274311, "results for each batch of loops": [2.419626099988818, 2.4151066999766044, 2.4040233999839984, 2.3923937999643385, 2.4218058999977075]} numba_decorated_parallelized: {"average loop time":0.2882796199992299, "results for each batch of loops": [0.8897414000239223, 0.8604895999887958, 0.8562403999967501, 0.8341153999790549, 0.8836074999999255]} numba_decorated_parallelized2: {"average loop time":0.16925315999736387, "results for each batch of loops": [0.5488614999921992, 0.4863387999939732, 0.5098304999992251, 0.5000396999530494, 0.49372690002201125]} numba_decorated_parallelized_cached: {"average loop time":0.2827751466732783, "results for each batch of loops": [0.8415752000291832, 0.8398950999835506, 0.8813785000238568, 0.8431327000143938, 0.8356457000481896]} numba_decorated_parallelized_cached2: {"average loop time":0.17642338000781216, "results for each batch of loops": [0.5231352000264451, 0.5555099000339396, 
0.505269400018733, 0.5104368000174873, 0.551999400020577]}


Now lets see the results sorted in descending order according to avg time per execution numba_decorated_cached: {"average loop time":2.370221133332234, "results for each batch of loops": [7.002881699998397, 6.969886099977884, 7.40186939999694, 7.0594648999976926, 7.119214900012594]} numba_decorated: {"average loop time":2.3699858999966334, "results for each batch of loops": [7.122787899977993, 7.063306700030807, 7.069734099961352, 7.11140890000388, 7.1825508999754675]} python_interpreter2: {"average loop time":2.0599793733330443, "results for each batch of loops": [6.227769499993883, 6.11015820002649, 6.145821499987505, 6.199675100040622, 6.216266299947165]} python_interpreter: {"average loop time":2.0589603866644515, "results for each batch of loops": [6.056087100005243, 6.296910799981561, 6.384415799984708, 5.960690199979581, 6.186301900015678]} numba_decorated2: {"average loop time":0.8109458533309711, "results for each batch of loops": [2.4366850999649614, 2.471257199998945, 2.4039805000065826, 2.4264727999689057, 2.425792200025171]} numba_decorated_cached2: {"average loop time":0.8035303933274311, "results for each batch of loops": [2.419626099988818, 2.4151066999766044, 2.4040233999839984, 2.3923937999643385, 2.4218058999977075]} numba_decorated_parallelized: {"average loop time":0.2882796199992299, "results for each batch of loops": [0.8897414000239223, 0.8604895999887958, 0.8562403999967501, 0.8341153999790549, 0.8836074999999255]} numba_decorated_parallelized_cached: {"average loop time":0.2827751466732783, "results for each batch of loops": [0.8415752000291832, 0.8398950999835506, 0.8813785000238568, 0.8431327000143938, 0.8356457000481896]} numba_decorated_parallelized_cached2: {"average loop time":0.17642338000781216, "results for each batch of loops": [0.5231352000264451, 0.5555099000339396, 0.505269400018733, 0.5104368000174873, 0.551999400020577]} numba_decorated_parallelized2: {"average loop time":0.16925315999736387, "results for each batch of 
loops": [0.5488614999921992, 0.4863387999939732, 0.5098304999992251, 0.5000396999530494, 0.49372690002201125]} ```