@@ -13,10 +13,10 @@ def tune():
1313 tune_params ["block_size_x" ] = [2 ** i for i in range (5 ,11 )]
1414 tune_params ["use_shuffle" ] = [0 , 1 ]
1515 tune_params ["vector" ] = [2 ** i for i in range (3 )]
16- tune_params ["num_blocks" ] = [2 ** i for i in range (5 ,11 )]
16+ tune_params ["num_blocks" ] = [2 ** i for i in range (5 ,16 )]
1717
1818 problem_size = "num_blocks"
19- size = 80000000
19+ size = 800000000
2020 max_blocks = max (tune_params ["num_blocks" ])
2121
2222 x = numpy .random .rand (size ).astype (numpy .float32 )
@@ -43,6 +43,7 @@ def verify_partial_reduce(cpu_result, gpu_result, atol=None):
4343 tune_params ["num_blocks" ] = [1 ]
4444 second_kernel = dict ()
4545 for nblocks in num_blocks :
46+ print ('nblocks:' , nblocks )
4647 #change the input size to nblocks
4748 args = [sum_x , x , numpy .int32 (nblocks )]
4849 #tune the second kernel with n=nblocks
@@ -58,8 +59,11 @@ def verify_partial_reduce(cpu_result, gpu_result, atol=None):
5859 for i , instance in enumerate (first_kernel ):
5960 first_kernel [i ]["total" ] = instance ["time" ] + second_kernel [instance ["num_blocks" ]]["time" ]
6061
62+ first_config = min (first_kernel , key = lambda x :x ['time' ])
6163 best_config = min (first_kernel , key = lambda x :x ['total' ])
6264
65+ print ("Best performing config first kernel only: \n " + get_config_string (first_config ))
66+
6367 print ("Best performing config: \n " + get_config_string (best_config ))
6468 print ("uses the following config for the secondary kernel:" )
6569 print (get_config_string (second_kernel [best_config ["num_blocks" ]]))
0 commit comments