bayesian-case-study/basic_distributions.py at main · AlexFiliakov/bayesian-case-study · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
import logging
import os
from collections import Counter
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import psutil
from scipy.optimize import minimize
from scipy.special import betaln, comb
from scipy.stats import binom, nbinom, poisson


def fit_uniform(y):
    """
    Fit a Uniform distribution to the data y using maximum likelihood estimation.

    Parameters
    ----------
    y : array_like of shape (n_samples,): Data to fit.

    Returns
    -------
    log_likelihood_uniform : float
        Log-likelihood of the Uniform model.
    n_fitted : float
        Fitted parameter for the Uniform distribution.
    bic_uniform : float
        Bayesian Information Criterion (BIC) of the Uniform model.
    uniform_pmf : function
        Probability mass function of the Uniform distribution.
    neg_log_likelihood_uniform : function
        Negative log-likelihood function of the Uniform distribution.
    """

    # Define the probability mass function (pmf) of the Uniform distribution
    def uniform_pmf(k, n):
        k = np.asarray(k)  # Ensure k is a numpy array
        return [1 / n if j <= n else 0 for j in k]

    # Define the negative log-likelihood function for the Uniform distribution
    def neg_log_likelihood_uniform(params, data):
        n = params[0]  # Assuming n is the number of trials in the data
        pmf_values = uniform_pmf(data, n)
        # Avoid log(0) by replacing 0 with a very small number
        pmf_values = np.where(np.array(pmf_values) == 0, 1e-10, pmf_values)
        return -np.sum(np.log(pmf_values))

    n_range = range(1,np.max(y)*10)

    # Grid search for the best n
    best_n = None
    best_neg_log_likelihood = np.inf

    for n in n_range:
        # Initial guess for n
        initial_guess = np.array([n])
        # Minimize the negative log-likelihood
        result = minimize(neg_log_likelihood_uniform, initial_guess, args=(y,), bounds=[(1e-5, None)])

        if result.fun < best_neg_log_likelihood:
            best_neg_log_likelihood = result.fun
            best_n = result.x[0]

    # Extract the fitted parameter
    n_fitted = best_n

    # Calculate the log-likelihood of the model
    log_likelihood_uniform = -neg_log_likelihood_uniform([n_fitted], y)

    # Number of parameters (k) and number of data points (n)
    k = 1
    n = len(y)

    # Calculate BIC
    bic_uniform = k * np.log(n) - 2 * log_likelihood_uniform

    return log_likelihood_uniform, n_fitted, bic_uniform, uniform_pmf, neg_log_likelihood_uniform


def fit_poi(y):
    """
    Fit a Poisson distribution to the data y using maximum likelihood estimation.

    Parameters
    ----------
    y : array_like of shape (n_samples,): Data to fit.

    Returns
    -------
    log_likelihood_poi : float
        Log-likelihood of the Poisson model.
    λ : float
        Fitted parameter for the Poisson distribution.
    bic_poi : float
        Bayesian Information Criterion (BIC) of the Poisson model.
    neg_log_likelihood_poi : function
        Negative log-likelihood function of the Poisson distribution
    """
    λ = np.mean(y)  # MLE for λ in Poisson distribution

    #Define the negative log-likelihood function for the Poisson distribution
    def neg_log_likelihood_poi(λ, data):
        return -np.sum(poisson.logpmf(data, λ))

    # Calculate the log-likelihood of the Poisson model
    log_likelihood_poi = -neg_log_likelihood_poi(λ, y)

    # Number of parameters (k) and number of data points (n)
    k = 1
    n = len(y)

    # Calculate the Bayesian Information Criterion (BIC)
    bic_poi = k * np.log(n) - 2 * log_likelihood_poi

    return log_likelihood_poi, λ, bic_poi, neg_log_likelihood_poi

# Define the negative log-likelihood function for the Binomial model
def neg_log_likelihood_binom(params, y):
    n, p = params # Unpack the parameters
    pmf_values = binom.pmf(y, n, p)
    # Avoid log(0) by replacing 0 with a very small number
    pmf_values = np.where(np.array(pmf_values) == 0, 1e-10, pmf_values)
    result = -np.sum(np.log(pmf_values))
    return result

def fit_binom(y):
    """
    Fit a Binomial distribution to the data y using maximum likelihood estimation.

    Parameters
    ----------
    y : array_like of shape (n_samples,): Data to fit.

    Returns
    -------
    log_likelihood_binom : float
        Log-likelihood of the Binomial model.
    n_fitted_binom : float
        Fitted parameter for the Binomial distribution.
    p_fitted_binom : float
        Fitted parameter for the Binomial distribution.
    bic_binom : float
        Bayesian Information Criterion (BIC) of the Binomial model.
    n_grid : array_like
        Grid of fitted n values.
    p_grid : array_like
        Grid of fitted p values.
    log_likelihood_grid : array_like
        Grid of fitted log-likelihood values.
    neg_log_likelihood_binom : function
        Negative log-likelihood function of the Binomial distribution
    """

    # Define the grid of parameter ranges
    n_trials_range = np.arange(0, np.max(y)*200 + 1, 5)
    p_range = np.linspace(1e-5, 1 - 1e-5, 100)

    # Vectorized grid search for the best n and p
    n_grid, p_grid = np.meshgrid(n_trials_range, p_range, indexing='ij')
    log_likelihood_grid = np.zeros_like(n_grid, dtype=float)

    for i in range(n_grid.shape[0]):
        for j in range(n_grid.shape[1]):
            n = n_grid[i, j]
            p = p_grid[i, j]
            log_likelihood_grid[i, j] = -neg_log_likelihood_binom((np.array([n]),np.array([p])), np.array(y))
            pass

    # Mask out np.inf values by setting them to a very large number
    log_likelihood_grid[np.isinf(log_likelihood_grid)] = 1e10

    # Find the indices of the minimum negative log-likelihood
    min_idx = np.unravel_index(np.argmax(log_likelihood_grid), log_likelihood_grid.shape)
    n_fitted_binom = n_grid[min_idx]
    p_fitted_binom = p_grid[min_idx]
    best_neg_log_likelihood = log_likelihood_grid[min_idx]

    # Calculate the log-likelihood
    log_likelihood_binom = best_neg_log_likelihood

    # Number of parameters (k)
    k = 2

    # Number of data points (n)
    n = len(y)

    # Calculate BIC
    bic_binom = k * np.log(n) - 2 * log_likelihood_binom

    return log_likelihood_binom, n_fitted_binom, p_fitted_binom, bic_binom, n_grid, p_grid, log_likelihood_grid
    # Define the negative log-likelihood function for the Negative Binomial distribution

def neg_log_likelihood_nb(params, y):
    r, p = params # Unpack the parameters
    pmf_values = nbinom.pmf(y, r, p)
    # Avoid log(0) by replacing 0 with a very small number
    pmf_values = np.where(np.array(pmf_values) == 0, 1e-10, pmf_values)
    result = -np.sum(np.log(pmf_values))
    return result

# Function to perform optimization for a given r
def optimize_for_r(i, r, y, p_range):
    best_p = None
    best_neg_log_likelihood = np.inf
    local_log_likelihood_matrix = np.zeros(len(p_range))
    for j, p in enumerate(p_range):
        initial_guess = np.array([r, p])
        result = neg_log_likelihood_nb(initial_guess, y)
        # result = minimize(neg_log_likelihood_nb, initial_guess, args=(y,), bounds=[(1e-5, None), (1e-5, 1-1e-5)])

        # Store the results in the local matrix
        local_log_likelihood_matrix[j] = result

        if result < best_neg_log_likelihood:
            best_neg_log_likelihood = result
            best_p = p
    return i, local_log_likelihood_matrix, r, best_p, best_neg_log_likelihood

def fit_nb(y, max_workers=None):
    """
    Fit a Negative Binomial distribution to the data y using maximum likelihood estimation.

    Parameters
    ----------
    y : array_like of shape (n_samples,): Data to fit.

    Returns
    -------
    log_likelihood_nb : float
        Log-likelihood of the Negative Binomial model.
    r_nb_fitted : float
        Fitted parameter for the Negative Binomial distribution.
    p_nb_fitted : float
        Fitted parameter for the Negative Binomial distribution.
    bic_nb : float
        Bayesian Information Criterion (BIC) of the Negative Binomial model.
    r_range : array_like
        Grid of fitted r values.
    p_range : array_like
        Grid of fitted p values.
    log_likelihood_matrix : array_like
        Grid of fitted log-likelihood values.
    neg_log_likelihood_nb : function
        Negative log-likelihood function of the Negative Binomial distribution
    """
    # Define the grid of parameter ranges
    r_range = np.linspace(1, np.max(y) + 1, 100)  # Adjusted range for r
    p_range = np.linspace(1e-5, 1 - 1e-5, 1000)  # Adjusted range for p

    # Initialize an empty matrix to store the results
    log_likelihood_matrix = np.zeros((len(r_range), len(p_range)))

    # If no max_workers provides, determine the number of workers based on CPU utilization
    if max_workers is None:
        # Get the utilization of each core
        core_utilizations = psutil.cpu_percent(percpu=True)

        # Count the number of cores with utilization under 75%
        underutilized_cores = sum(1 for utilization in core_utilizations if utilization < 75)

        # Set n_jobs to the minimum of the number of underutilized cores and a specified maximum (e.g., 4)
        max_jobs = os.cpu_count() - 1
        max_workers = min(underutilized_cores, max_jobs)

    results = []

    if max_workers == 1: # Don't parallelize if only 1 worker
        for i, r in enumerate(r_range):
            results.append(optimize_for_r(i, r, y, p_range))
    else: # Parallel processing to optimize for each r
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(optimize_for_r, i, r, y, p_range) for i, r in enumerate(r_range)]

            for i, future in enumerate(as_completed(futures)):
                try:
                    result = future.result(timeout=60)
                    if result is not None:
                        results.append(result)
                except Exception as e:
                    logging.info(f"Error in Neg Bin fit {i}: {e}")

    # Update the global log_likelihood_matrix with results from parallel processing
    for result in results:
        i, local_log_likelihood_matrix, r, best_p, best_neg_log_likelihood = result
        log_likelihood_matrix[i, :] = local_log_likelihood_matrix

    # Find the best parameters from the results
    best_result = min(results, key=lambda x: x[4])
    best_r, best_p, best_neg_log_likelihood = best_result[2], best_result[3], best_result[4]

    r_nb_fitted = best_r
    p_nb_fitted = best_p

    # Calculate the log-likelihood
    log_likelihood_nb = best_neg_log_likelihood

    # Number of parameters (k) and number of data points (n)
    k = 2
    n = len(y)

    # Calculate BIC
    bic_nb = k * np.log(n) + 2 * log_likelihood_nb

    return log_likelihood_nb, r_nb_fitted, p_nb_fitted, bic_nb, r_range, p_range, log_likelihood_matrix

def fit_beta_binom(y):
    """
    Fit a Beta-Binomial distribution to the data y using maximum likelihood estimation.

    Parameters
    ----------
    y : array_like of shape (n_samples,): Data to fit.

    Returns
    -------
    log_likelihood_bb : float
        Log-likelihood of the Beta-Binomial model.
    alpha_bb_fitted : float
        Fitted parameter for the Beta-Binomial distribution.
    beta_bb_fitted : float
        Fitted parameter for the Beta-Binomial distribution.
    bic_bb : float
        Bayesian Information Criterion (BIC) of the Beta-Binomial model.
    beta_binomial_pmf : function
        Probability mass function of the Beta-Binomial distribution.
    neg_log_likelihood_bb : function
        Negative log-likelihood function of the Beta-Binomial
    """
    # Define the Beta-Binomial PMF
    def beta_binomial_pmf(k, n, alpha, beta):
        k = np.asarray(k)  # Ensure k is a numpy array
        binom_coeff = comb(n, k)  # Calculate the binomial coefficient
        pmf = binom_coeff * np.exp(betaln(k + alpha, n - k + beta) - betaln(alpha, beta))
        return pmf

    # Define the negative log-likelihood function for the Beta-Binomial distribution
    def neg_log_likelihood_bb(params, data):
        alpha, beta = params
        n = max(data)  # Assuming n is the maximal value in the data
        return -np.sum(np.log(beta_binomial_pmf(data, n, alpha, beta)))

    # Initial guesses for alpha and beta
    initial_params = [1, 1]

    # Fit the Beta-Binomial distribution to the data
    model_bb = minimize(neg_log_likelihood_bb, initial_params, args=(y,), bounds=[(1e-5, None), (1e-5, None)])

    # Extract the fitted parameters
    alpha_bb_fitted, beta_bb_fitted = model_bb.x

    # Calculate the log-likelihood of the model
    log_likelihood_bb = -neg_log_likelihood_bb((alpha_bb_fitted, beta_bb_fitted), y)

    # Number of parameters (k) and number of data points (n)
    k = 2
    n = len(y)

    # Calculate BIC
    bic_bb = k * np.log(n) - 2 * log_likelihood_bb

    return log_likelihood_bb, alpha_bb_fitted, beta_bb_fitted, bic_bb, beta_binomial_pmf, neg_log_likelihood_bb


def fit_empirical(y):
    """
    Fit an Empirical distribution to the data y using maximum likelihood estimation.

    Parameters
    ----------
    y : array_like of shape (n_samples,): Data to fit.

    Returns
    -------
    log_likelihood : float
        Log-likelihood of the Empirical model.
    probabilities : dict
        Dictionary containing the probabilities of each unique value in the data.
    bic_emp : float
        Bayesian Information Criterion (BIC) of the Empirical model.
    predictive_distribution : function
        Predictive distribution of the Empirical distribution.
    neg_log_likelihood_emp : function
        Negative log-likelihood function of the Empirical distribution.
    """
    # Calculate the frequency of each value in the observed data
    frequency_counts = Counter(y)

    # Normalize the frequencies to get probabilities
    total_count = sum(frequency_counts.values())
    probabilities = {k: v / total_count for k, v in frequency_counts.items()}

    # Create the predictive distribution
    def predictive_distribution(value):
        return probabilities.get(value, 0)

    # Define the negative log-likelihood function for the Empirical distribution
    def neg_log_likelihood_emp(data):
        return -np.sum([np.log(predictive_distribution(val)) for val in data])

    # Calculate the log-likelihood of the observed data
    log_likelihood = -neg_log_likelihood_emp(y)

    # Number of parameters (k) and number of data points (n)
    k = len(y)
    n = len(y)

    # Calculate BIC
    bic_emp = k * np.log(n) - 2 * log_likelihood

    return log_likelihood, probabilities, bic_emp, predictive_distribution, neg_log_likelihood_emp


# Configure logging to save to a file
logging.basicConfig(filename='output.log', level=logging.INFO)