@@ -250,7 +250,7 @@ function init_marginal_cpu(data, reuse, num_nodes; Float)
     nfeatures = num_features(data)
     for i=1:nfeatures
         values[:, 2+i] .= log.(coalesce.(data[:,i], one(Float)))
-        values[:, 2+nfeatures+i] .= log.(coalesce.(1.0 .- data[:,i], one(Float)))
+        values[:, 2+nfeatures+i] .= log.(coalesce.(one(Float) .- data[:,i], one(Float)))
     end
     return values
 end
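The fix in this hunk replaces the Float64 literal 1.0 with one(Float). In Julia, broadcasting a Float64 literal against lower-precision data promotes the whole result, which defeats the Float type parameter. A minimal standalone sketch of the promotion behavior (illustrative only, not part of the patch):

    x = Float32[0.25, 0.75]
    eltype(1.0 .- x)           # Float64: the literal widens the result
    eltype(one(Float32) .- x)  # Float32: stays in the requested precision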
@@ -267,26 +267,26 @@ function init_marginal_gpu(data, reuse, num_nodes; Float=Float32)
     @views values[:, LogicCircuits.FALSE_BITS] .= log(zero(Float))
 
     nfeatures = num_features(data)
-    num_data = size(values, 1)
+
+    ## option 2 - still slow
 
-    # Option 1; not possible right now because we cannot pass a DataFrame to a CUDA kernel
-    # kernel = @cuda name="init_marginal_cuda" launch=false init_marginal_cuda(values, data, nfeatures)
-    # config = launch_configuration(kernel.fun)
-    # threads, blocks = balance_threads_2d(num_data, nfeatures, config.threads)
-    # kernel(values, data, nfeatures; threads, blocks)
 
-
-    ## option 2 - still slow
-    for i=1:nfeatures
-        @views values[:, 2+i] .= log.(coalesce.(data[:, i], one(Float)))
-        @views values[:, 2+i+nfeatures] .= log.(coalesce.(1.0 .- data[:, i], one(Float)))
+    if data[!, 1] isa CuBitVector
+        # Have to do this for CuBitVector since it does not play well with CuArray broadcast kernels
+        data_cpu = to_cpu(data)
+        for i=1:nfeatures
+            @views values[:, 2+i] .= to_gpu(log.(coalesce.(zero(Float) .+ data_cpu[!, i], one(Float))))
+            @views values[:, 2+nfeatures+i] .= to_gpu(log.(coalesce.(one(Float) .- data_cpu[!, i], one(Float))))
+        end
+    else
+        for i=1:nfeatures
+            @views values[:, 2+i] .= log.(coalesce.(zero(Float) .+ data[!, i], one(Float)))
+            @views values[:, 2+nfeatures+i] .= log.(coalesce.(one(Float) .- data[!, i], one(Float)))
+        end
     end
 
-    # Option 3 - very slow
-    # TODO ;;; here we should use a custom CUDA kernel to extract Float marginals from bit vectors
-    # for now the lazy solution is to move everything to the CPU and do the work there...
+    # Option 1 - very slow
     # data_cpu = to_cpu(data)
-    # nfeatures = num_features(data)
     # for i=1:nfeatures
     # marg_pos::Vector{Float} = log.(coalesce.(data_cpu[:,i], one(Float)))
     # marg_neg::Vector{Float} = log.(coalesce.(1.0 .- data_cpu[:,i], one(Float)))
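For reference, both branches above compute the same per-feature log-marginals, and the coalesce fallback is what handles missing values. A standalone sketch of the broadcast on one column (illustrative values, not part of the patch):

    Float = Float32
    col = [true, false, missing]  # a Bool feature column with a missing entry
    # zero(Float) .+ col converts the Bool/bit column to Floats before the log;
    # coalesce maps missing to one(Float), so log gives 0 and a missing
    # feature contributes nothing, i.e. it is marginalized out.
    log.(coalesce.(zero(Float) .+ col, one(Float)))  # [0.0, -Inf, 0.0]
    log.(coalesce.(one(Float) .- col, one(Float)))   # [-Inf, 0.0, 0.0]

The CuBitVector branch takes the to_cpu/to_gpu round trip because the bit-packed GPU columns do not broadcast cleanly; that keeps the computation correct at the cost of extra transfers, consistent with the "still slow" label.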