Skip to content

Commit 3b03ff9

Browse files
Merge branch 'dev'
2 parents 20c71fd + c762047 commit 3b03ff9

23 files changed

+418
-97
lines changed

DESCRIPTION

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: ccdrAlgorithm
22
Title: CCDr Algorithm for Learning Sparse Gaussian Bayesian Networks
3-
Version: 0.0.2
4-
Date: 2016-11-19
3+
Version: 0.0.3
4+
Date: 2017-03-09
55
Authors@R: c(
66
person("Bryon", "Aragam", email = "sparsebn@gmail.com", role = c("aut", "cre")),
77
person("Dacheng", "Zhang", role = c("aut"))
@@ -11,13 +11,16 @@ Description: Implementation of the CCDr (Concave penalized Coordinate Descent wi
1111
Depends:
1212
R (>= 3.2.3)
1313
Imports:
14-
sparsebnUtils (>= 0.0.2),
15-
Rcpp (>= 0.11.0)
14+
sparsebnUtils (>= 0.0.4),
15+
Rcpp (>= 0.11.0),
16+
stats,
17+
utils
1618
LinkingTo: Rcpp
1719
Suggests:
1820
testthat,
19-
graph
21+
graph,
22+
igraph
2023
URL: https://github.com/itsrainingdata/ccdrAlgorithm
2124
BugReports: https://github.com/itsrainingdata/ccdrAlgorithm/issues
2225
License: GPL (>= 2)
23-
RoxygenNote: 5.0.1
26+
RoxygenNote: 6.0.1

NAMESPACE

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,21 @@
33
S3method(edgeList,SparseBlockMatrixR)
44
S3method(sparse,SparseBlockMatrixR)
55
export(ccdr.run)
6+
export(generate_mvn_data)
67
importFrom(Rcpp,sourceCpp)
8+
importFrom(sparsebnUtils,as.sparse)
79
importFrom(sparsebnUtils,edgeList)
810
importFrom(sparsebnUtils,get.adjacency.matrix)
11+
importFrom(sparsebnUtils,is.edgeList)
12+
importFrom(sparsebnUtils,is.sparse)
913
importFrom(sparsebnUtils,is.zero)
1014
importFrom(sparsebnUtils,num.edges)
1115
importFrom(sparsebnUtils,num.nodes)
1216
importFrom(sparsebnUtils,reIndexC)
1317
importFrom(sparsebnUtils,reIndexR)
1418
importFrom(sparsebnUtils,sparse)
1519
importFrom(sparsebnUtils,to_graphNEL)
16-
useDynLib(ccdrAlgorithm)
20+
importFrom(sparsebnUtils,to_igraph)
21+
importFrom(stats,rnorm)
22+
importFrom(utils,tail)
23+
useDynLib(ccdrAlgorithm, .registration = TRUE)

NEWS.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
# ccdrAlgorithm 0.0.3
2+
3+
## Features
4+
5+
* New `generate_mvn_data()` method to generate multivariate normal data from a DAG.
6+
7+
## Notes
8+
9+
* Added warning when dataset contains more than 10,000 columns: This requires building from source. The CCDr algorithm has been safely tested on datasets with up to 8,000 variables.
10+
* By default, `ccdr.run()` includes the node names in the `sparsebnPath` output.
11+
112
# ccdrAlgorithm 0.0.2
213

314
## Features

R/ccdrAlgorithm-functions.R

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,11 @@
1+
#
2+
# ccdrAlgorithm-functions.R
3+
# ccdrAlgorithm
4+
#
5+
# Created by Dacheng Zhang on 9/3/16.
6+
# Copyright (c) 2014-2017 Bryon Aragam. All rights reserved.
7+
#
8+
19
## returns TRUE if ivn_list is a list of vectors or NULL elements,
210
check_if_ivn_list <- function(ivn) {
311
## check if it is a list
@@ -24,7 +32,7 @@ check_vector_label <- function(vec, pp) {
2432
## e.g.: c(NA, 1L, NA, 3L, NA, 5L)
2533
## However, c(1L, NA, 3L, 4, NA) returns all FALSE
2634
## check if labels are integers
27-
if(any(is.na(vec)) || !is.integer(vec)) {
35+
if(any(is.na(vec)) || !is.numeric(vec)) {
2836
stop("Non-integer label(s) found in one or more components in ivn.")
2937
return(FALSE)
3038
}

R/ccdrAlgorithm-main.R

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
#
2-
# ccdr-main-R.R
2+
# ccdrAlgorithm-main.R
33
# ccdrAlgorithm
44
#
55
# Created by Bryon Aragam (local) on 1/22/16.
6-
# Copyright (c) 2014-2016 Bryon Aragam. All rights reserved.
6+
# Copyright (c) 2014-2017 Bryon Aragam. All rights reserved.
77
#
88

99
#
@@ -17,7 +17,7 @@
1717
#
1818

1919
###--- These two lines are necessary to import the auto-generated Rcpp methods in RcppExports.R---###
20-
#' @useDynLib ccdrAlgorithm
20+
#' @useDynLib ccdrAlgorithm, .registration = TRUE
2121
#' @importFrom Rcpp sourceCpp
2222
NULL
2323

@@ -58,25 +58,22 @@ NULL
5858
#'
5959
#' @examples
6060
#'
61-
#' \dontrun{
62-
#'
6361
#' ### Generate some random data
6462
#' dat <- matrix(rnorm(1000), nrow = 20)
65-
#' dat <- sparsebnData(dat, type = "continuous")
63+
#' dat <- sparsebnUtils::sparsebnData(dat, type = "continuous")
6664
#'
6765
#' # Run with default settings
68-
#' ccdr.run(data = dat)
66+
#' ccdr.run(data = dat, lambdas.length = 20)
6967
#'
7068
#' ### Optional: Adjust settings
71-
#' pp <- ncol(dat)
69+
#' pp <- ncol(dat$data)
7270
#'
7371
#' # Initialize algorithm with a random initial value
7472
#' init.betas <- matrix(0, nrow = pp, ncol = pp)
7573
#' init.betas[1,2] <- init.betas[1,3] <- init.betas[4,2] <- 1
7674
#'
7775
#' # Run with adjusted settings
78-
#' ccdr.run(data = dat, betas = init.betas, lambdas.length = 10, alpha = 10, verbose = TRUE)
79-
#' }
76+
#' ccdr.run(data = dat, betas = init.betas, lambdas.length = 20, alpha = 10, verbose = TRUE)
8077
#'
8178
#' @export
8279
ccdr.run <- function(data,
@@ -111,6 +108,9 @@ ccdr.run <- function(data,
111108
verbose = verbose)
112109
} # END CCDR.RUN
113110

111+
### Maximum number of nodes allowed
112+
MAX_CCS_ARRAY_SIZE <- function() 10000
113+
114114
# ccdr_call
115115
#
116116
# Handles most of the bookkeeping for CCDr. Sets default values and prepares arguments for
@@ -153,6 +153,10 @@ ccdr_call <- function(data,
153153
nn <- as.integer(nrow(data))
154154
pp <- as.integer(ncol(data))
155155

156+
if(pp > MAX_CCS_ARRAY_SIZE()){
157+
stop(max_nodes_warning(pp))
158+
}
159+
156160
if(is.null(ivn)) ivn <- vector("list", nn) # to pass testthat for observational data cases
157161
### Check ivn
158162
if(!check_if_ivn_list(ivn)) stop("ivn must be a list of NULLs or vectors!")
@@ -216,6 +220,7 @@ ccdr_call <- function(data,
216220
# Still need to set start = 0, though.
217221
betas$start <- 0
218222
} # Type-checking for betas happens in ccdr_singleR
223+
219224
# This parameter can be set by the user, but in order to prevent the algorithm from taking too long to run
220225
# it is a good idea to keep the threshold used by default which is O(sqrt(pp))
221226
if(is.null(max.iters)){
@@ -252,6 +257,7 @@ ccdr_call <- function(data,
252257
### Coerce sbm output to edgeList
253258
names(fit[[k]])[1] <- "edges" # rename 'sbm' slot to 'edges': After the next line, this slot will no longer be an SBM object
254259
fit[[k]]$edges <- sparsebnUtils::as.edgeList(fit[[k]]$edges) # Before coercion, li$edges is actually an SBM object
260+
names(fit[[k]]$edges) <- names(data)
255261

256262
### Add node names to output
257263
fit[[k]] <- append(fit[[k]], list(names(data)), after = 1) # insert node names into second slot

R/ccdrAlgorithm-messages.R

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#
2+
# ccdrAlgorithm-messages.R
3+
# ccdrAlgorithm
4+
#
5+
# Created by Bryon Aragam (local) on 11/20/16.
6+
# Copyright (c) 2014-2017 Bryon Aragam. All rights reserved.
7+
#
8+
9+
#
10+
# PACKAGE CCDRALGORITHM: Messages
11+
#
12+
# CONTENTS:
13+
# max_nodes_warning
14+
#
15+
16+
### These warnings are all internal to this package and hence
17+
### do not need to be exported
18+
19+
### User inputs invalid data object
20+
max_nodes_warning <- function(numnode){
21+
msg <- "This dataset contains more than %d variables -- in order to
22+
run CCDr on this dataset, please download the source, increase
23+
_MAX_CCS_ARRAY_SIZE_ to at least %d, and re-build the package
24+
from source. If you have any trouble, please contact the
25+
maintainer."
26+
stop(sprintf(msg, MAX_CCS_ARRAY_SIZE(), numnode))
27+
}

R/ccdrAlgorithm-mvn.R

Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
#
2+
# ccdrAlgorithm-mvn.R
3+
# ccdrAlgorithm
4+
#
5+
# Created by Bryon Aragam (local) on 1/15/17.
6+
# Copyright (c) 2014-2017 Bryon Aragam. All rights reserved.
7+
#
8+
9+
#' Generate data from a DAG
10+
#'
11+
#' Given a Gaussian DAG, generate data from the underlying distribution.
12+
#' Equivalently, generate data from a multivariate normal distribution given
13+
#' one of its SEM. Can generate both observational and intervention data.
14+
#'
15+
#' If \code{ivn = NULL}, then \code{n} observational samples are drawn. For each
16+
#' component of \code{ivn} that is not \code{NULL}, interventional samples will
17+
#' be drawn with the values of each node specified in the component.
18+
#'
19+
#' @param graph DAG in \code{\link{edgeList}} format.
20+
#' @param params Vector of parameters. Last p elements correspond to variances (p = number of nodes in \code{graph}), initial elements correspond to edge weights.
21+
#' @param n Number of samples to draw.
22+
#' @param ivn List of interventions (see \code{\link[sparsebnUtils]{sparsebnData}}). Must be a \code{list} with exactly \code{n} components.
23+
#' @param ivn.rand If \code{TRUE}, random N(0,1) values will be drawn for each intervention. Otherwise, these values need to supplied manually in \code{ivn}.
24+
#'
25+
#' @examples
26+
#'
27+
#' ### Generate observational data
28+
#' gr <- sparsebnUtils::random.graph(5, 5) # use sparsebnUtils package to generate a random graph
29+
#' gr.params <- runif(10) # there are 5 coefficients + 5 variances
30+
#' data.obs <- ccdrAlgorithm::generate_mvn_data(graph = gr,
31+
#' n = 100,
32+
#' params = gr.params)
33+
#'
34+
#' ### Generate experimental data
35+
#' ivn <- as.list(c(rep("V1", 50), rep("V2", 50))) # 50 interventions on V1, 50 interventions on V2
36+
#' data.ivn <- ccdrAlgorithm::generate_mvn_data(graph = gr,
37+
#' n = 100,
38+
#' params = gr.params,
39+
#' ivn = ivn)
40+
#'
41+
#' @export
42+
generate_mvn_data <- function(graph, params, n = 1, ivn = NULL, ivn.rand = TRUE){
43+
### This function requires the 'igraph' package to be installed
44+
if (!requireNamespace("igraph", quietly = TRUE)) {
45+
stop("The igraph package is required for the method 'generate_mvn_data'. Please install it using install.packages(\"igraph\").", call. = FALSE)
46+
}
47+
48+
stopifnot(sparsebnUtils::is.edgeList(graph))
49+
stopifnot(is.numeric(params))
50+
stopifnot(length(params) == sparsebnUtils::num.edges(graph) + sparsebnUtils::num.nodes(graph))
51+
52+
if(is.null(names(graph))){
53+
stop("Input 'graph' requires node names!")
54+
}
55+
56+
if(!is.null(ivn)){
57+
stopifnot(is.list(ivn))
58+
stopifnot(length(ivn) == n)
59+
60+
### Generate random intervention values
61+
if(ivn.rand){
62+
ivn <- lapply(ivn, function(x) sapply(x, function(x) rnorm(n = 1, mean = 0, sd = 1))) # assume standard normal
63+
# ivn <- lapply(ivn, function(x) sapply(x, function(x) 1)) # debugging
64+
}
65+
}
66+
67+
### Need this to ensure the output has the same order as the input
68+
### after things get shuffled around
69+
original_node_order <- names(graph)
70+
71+
### Get topological sort
72+
### Note that the check for the igraph pkg occurs in sparsebnUtils::to_igraph
73+
topsort <- names(igraph::topo_sort(sparsebnUtils::to_igraph(graph)))
74+
75+
nnode <- length(original_node_order)
76+
vars <- utils::tail(params, nnode) # parameters associated with variances
77+
names(vars) <- original_node_order
78+
coefs <- params[1:(length(params) - nnode)] # parameters associated with edge weights
79+
sp <- sparsebnUtils::as.sparse(graph)
80+
sp$vals <- coefs # previous line leaves NAs for values in sparse object; need to fill these in
81+
edgelist <- sparse_to_edgeWeightList(sp, original_node_order)
82+
nodes <- names(edgelist) # this will be sorted according to the topological order
83+
84+
### The old way, efficient for obs data only
85+
# x <- replicate(n, generate_mvn_vector(edgelist, nodes, topsort, vars))
86+
# x <- t(x)[, original_node_order]
87+
88+
x <- vector("list", length = n)
89+
for(i in 1:n){
90+
x[[i]] <- generate_mvn_vector(edgelist, nodes, topsort, vars, ivn = ivn[[i]])
91+
}
92+
x <- do.call("rbind", x)
93+
94+
### Permute columns back to original ordering
95+
x <- x[, original_node_order]
96+
x
97+
}
98+
99+
generate_mvn_vector <- function(edgelist, nodes, topsort, vars = NULL, ivn = NULL){
100+
normal_seed <- sapply(vars, function(x) rnorm(n = 1, mean = 0, sd = sqrt(x)))
101+
gen_dag_vector_R(edgelist, nodes, topsort, seed = normal_seed, ivn = ivn)
102+
}
103+
104+
#
105+
# edgelist = graph information
106+
# nodes = names of nodes in graph
107+
# topsort = topological sort (indexed by node names)
108+
# seed = random noise (Gaussian); bias term (binary)
109+
# ivn = named vector of intervention values (do(child = x))
110+
#
111+
gen_dag_vector_R <- function(edgelist, nodes, topsort, seed, ivn = NULL){
112+
nnode <- length(edgelist)
113+
x <- numeric(nnode)
114+
names(x) <- nodes
115+
ivnnames <- names(ivn)
116+
117+
for(j in seq_along(topsort)){
118+
child <- topsort[j]
119+
120+
if(child %in% ivnnames){
121+
### If node is intervened on, fix value according to input in 'ivn'
122+
x[child] <- ivn[child]
123+
} else{
124+
### If no intervention, use DAG to determine value from parents
125+
parents <- edgelist[[child]]$parents
126+
weights <- edgelist[[child]]$weights
127+
nparents <- length(parents)
128+
if(nparents > 0){
129+
### Iterate over parents and add associated effects
130+
for(i in seq_along(parents)){
131+
this.par <- parents[i]
132+
x[child] <- x[child] + weights[i] * x[this.par]
133+
# x[child] <- x[child] + weights[i] * x[index[i]] # equivalent to above line
134+
}
135+
}
136+
137+
### Add noise: This is a crucial step. If nothing is added here, the
138+
### output will be all zeroes since the root node(s) will
139+
### have x[child] = 0 at this point.
140+
###
141+
### Gaussian model: This is random error ~ N(0, vars[j])
142+
### Logistic model: This a (deterministic) bias term
143+
x[child] <- x[child] + seed[child]
144+
}
145+
146+
}
147+
148+
x
149+
}
150+
151+
sparse_to_edgeWeightList <- function(x, nodes){
152+
stopifnot(sparsebnUtils::is.sparse((x)))
153+
# sp <- sparsebnUtils::as.sparse(x) # NOTE: no longer a bottleneck under sparsebnUtils v0.0.4
154+
155+
# nodes <- colnames(x)
156+
stopifnot(x$dim[1] == x$dim[2])
157+
158+
out <- lapply(vector("list", length = x$dim[1]), function(z) list(parents = character(0), index = integer(0), weights = numeric(0)))
159+
names(out) <- nodes
160+
for(j in seq_along(x$cols)){
161+
child <- x$cols[[j]]
162+
parent <- x$rows[[j]]
163+
weight <- x$vals[[j]]
164+
parents <- c(out[[child]]$parents, nodes[parent]) # !!! THIS IS SLOW
165+
index <- c(out[[child]]$index, parent) # !!! THIS IS SLOW
166+
weights <- c(out[[child]]$weights, weight) # !!! THIS IS SLOW
167+
out[[nodes[child]]] <- list(parents = parents, index = index, weights = weights)
168+
}
169+
170+
out
171+
}

R/s3-SparseBlockMatrixR.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# ccdrAlgorithm
44
#
55
# Created by Bryon Aragam (local) on 1/22/16.
6-
# Copyright (c) 2014-2016 Bryon Aragam. All rights reserved.
6+
# Copyright (c) 2014-2017 Bryon Aragam. All rights reserved.
77
#
88

99
#------------------------------------------------------------------------------#

R/s3-generics.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# ccdrAlgorithm
44
#
55
# Created by Bryon Aragam (local) on 1/22/16.
6-
# Copyright (c) 2014-2016 Bryon Aragam. All rights reserved.
6+
# Copyright (c) 2014-2017 Bryon Aragam. All rights reserved.
77
#
88

99
#

0 commit comments

Comments
 (0)