@@ -250,7 +250,7 @@ function init_marginal_cpu(data, reuse, num_nodes; Float)
     nfeatures = num_features(data)
     for i=1:nfeatures
         values[:, 2+i] .= log.(coalesce.(data[:,i], one(Float)))
-        values[:, 2+nfeatures+i] .= log.(coalesce.(1.0 .- data[:,i], one(Float)))
+        values[:, 2+nfeatures+i] .= log.(coalesce.(one(Float) .- data[:,i], one(Float)))
     end
     return values
 end
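The fix in this hunk replaces the Float64 literal 1.0 with one(Float). In Julia, broadcasting a Float64 literal against lower-precision data promotes the whole result, which defeats the Float type parameter. A minimal standalone sketch of the promotion behavior (illustrative only, not part of the patch):

    x = Float32[0.25, 0.75]
    eltype(1.0 .- x)           # Float64: the literal widens the result
    eltype(one(Float32) .- x)  # Float32: stays in the requested precision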
@@ -267,26 +267,26 @@ function init_marginal_gpu(data, reuse, num_nodes; Float=Float32)
     @views values[:, LogicCircuits.FALSE_BITS] .= log(zero(Float))
 
     nfeatures = num_features(data)
-    num_data = size(values, 1)
+
+    ## option 2 - still slow
 
-    # Option 1; not possible right now because we cannot pass a DataFrame to a CUDA kernel
-    # kernel = @cuda name="init_marginal_cuda" launch=false init_marginal_cuda(values, data, nfeatures)
-    # config = launch_configuration(kernel.fun)
-    # threads, blocks = balance_threads_2d(num_data, nfeatures, config.threads)
-    # kernel(values, data, nfeatures; threads, blocks)
 
-
-    ## option 2 - still slow
-    for i=1:nfeatures
-        @views values[:, 2+i] .= log.(coalesce.(data[:, i], one(Float)))
-        @views values[:, 2+i+nfeatures] .= log.(coalesce.(1.0 .- data[:, i], one(Float)))
+    if data[!, 1] isa CuBitVector
+        # Have to do this for CuBitVector since it does not play well with CuArray broadcast kernels
+        data_cpu = to_cpu(data)
+        for i=1:nfeatures
+            @views values[:, 2+i] .= to_gpu(log.(coalesce.(zero(Float) .+ data_cpu[!, i], one(Float))))
+            @views values[:, 2+nfeatures+i] .= to_gpu(log.(coalesce.(one(Float) .- data_cpu[!, i], one(Float))))
+        end
+    else
+        for i=1:nfeatures
+            @views values[:, 2+i] .= log.(coalesce.(zero(Float) .+ data[!, i], one(Float)))
+            @views values[:, 2+nfeatures+i] .= log.(coalesce.(one(Float) .- data[!, i], one(Float)))
+        end
     end
 
-    # Option 3 - very slow
-    # TODO ;;; here we should use a custom CUDA kernel to extract Float marginals from bit vectors
-    # for now the lazy solution is to move everything to the CPU and do the work there...
+    # Option 1 - very slow
     # data_cpu = to_cpu(data)
-    # nfeatures = num_features(data)
     # for i=1:nfeatures
     # marg_pos::Vector{Float} = log.(coalesce.(data_cpu[:,i], one(Float)))
     # marg_neg::Vector{Float} = log.(coalesce.(1.0 .- data_cpu[:,i], one(Float)))
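For reference, both branches above compute the same per-feature log-marginals, and the coalesce fallback is what handles missing values. A standalone sketch of the broadcast on one column (illustrative values, not part of the patch):

    Float = Float32
    col = [true, false, missing]  # a Bool feature column with a missing entry
    # zero(Float) .+ col converts the Bool/bit column to Floats before the log;
    # coalesce maps missing to one(Float), so log gives 0 and a missing
    # feature contributes nothing, i.e. it is marginalized out.
    log.(coalesce.(zero(Float) .+ col, one(Float)))  # [0.0, -Inf, 0.0]
    log.(coalesce.(one(Float) .- col, one(Float)))   # [-Inf, 0.0, 0.0]

The CuBitVector branch takes the to_cpu/to_gpu round trip because the bit-packed GPU columns do not broadcast cleanly; that keeps the computation correct at the cost of extra transfers, consistent with the "still slow" label.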