
Commit 34deb3b

A better example of a profiler

1 parent ff66ab9 commit 34deb3b

File tree

docs/src/lecture_11/lecture.md
docs/src/lecture_11/profile_nn.jl

2 files changed: +41 -1 lines changed
docs/src/lecture_11/lecture.md

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ is about `315` μs, which is still 160x faster.
     NVTX.@range "julia set" juliaset_pixel.(cis, cjs, n);
 end
 ```
-for better orientation in the code. Note that if NVTX information does not show up in the trace, we have to add it to the trace by running the profiler with `--trace=cuda,nvtx`.
+for better orientation in the code. Note that if NVTX information does not show up in the trace, we have to add it to the trace by running the profiler with `--trace=cuda,nvtx`. [For a more sophisticated example, click here.](profile_nn.jl)
 Lastly, it is recommended to run a kernel twice in a profile trace, as the first execution of the kernel in a profiler incurs some overhead even though the code has already been compiled.

 In the output of the profiler we see that there is a lot of overhead caused by launching the kernel itself, and that the execution itself is relatively fast.
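
The profiler itself is launched from the shell, e.g. `nsys profile --trace=cuda,nvtx julia profile_nn.jl` (assuming NVIDIA Nsight Systems is installed). A minimal sketch of the warm-up pattern recommended above, assuming the `juliaset_pixel` kernel and the `cis`, `cjs`, `n` inputs defined earlier in the lecture:

```julia
# First call pays the one-off kernel-launch (and possible compilation) overhead.
juliaset_pixel.(cis, cjs, n);
# The second, profiled call then reflects the steady-state cost.
CUDA.@profile CUDA.@sync begin
    NVTX.@range "julia set (warm)" juliaset_pixel.(cis, cjs, n);
end
```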

docs/src/lecture_11/profile_nn.jl

Lines changed: 40 additions & 0 deletions
using CUDA
using NVTX   # needed for NVTX.@range unless your CUDA.jl version re-exports NVTX

# define a dense layer
struct Dense{W<:AbstractArray,B<:AbstractArray,F}
    w::W
    b::B
    f::F
end

function Dense(idim, odim, f = identity)
    Dense(randn(Float32, odim, idim), randn(Float32, odim), f)
end

function (l::Dense)(x)
    l.f.(l.w * x .+ l.b)
end

# define moving of data between CPU and GPU
gpu(x::AbstractArray) = CuArray(x)
cpu(x::CuArray) = Array(x)
gpu(l::Dense) = Dense(gpu(l.w), gpu(l.b), l.f)
gpu(l::ComposedFunction) = gpu(l.outer) ∘ gpu(l.inner)

# a simple but powerful non-linearity
relu(x::T) where {T<:Number} = max(x, zero(T))

# Let's now define a small neural network with one hidden layer
x = randn(Float32, 16, 100)
l₁ = Dense(16, 32, relu)
l₂ = Dense(32, 8)
nn = l₂ ∘ l₁

# and try to profile a computation; @sync ensures all GPU work finishes
# inside the profiled region
CUDA.@profile CUDA.@sync begin
    NVTX.@range "moving nn to gpu" gpu_nn = gpu(nn)
    NVTX.@range "moving x to gpu" gpu_x = gpu(x)
    NVTX.@range "nn(x)" o = gpu_nn(gpu_x)
    NVTX.@range "moving results to cpu" cpu(o)
end
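
In line with the recommendation above, one might run the profiled block a second time, since the first execution incurs one-off launch and compilation overhead. A hypothetical sanity check (not part of the committed file, using the globals defined above) that the GPU forward pass matches the CPU one:

```julia
# Hypothetical check: GPU and CPU forward passes should agree up to Float32 tolerance.
@assert isapprox(cpu(gpu_nn(gpu_x)), nn(x); atol = 1f-4)
```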
