import numpy as np


def get_betas(X, Y):
    """Estimate OLS coefficients beta = (X'X)^{-1} X'Y.

    Parameters
    ----------
    X : (n, k) array-like design matrix (intercept column supplied by caller)
    Y : (n,) array-like response

    Returns
    -------
    numpy.ndarray of shape (k,)

    BUG FIX: the original used ``^`` (bitwise xor) and elementwise ``*``
    instead of a matrix inverse and matrix products.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    # solve() is numerically safer than forming the explicit inverse.
    return np.linalg.solve(X.T @ X, X.T @ Y)


def get_residuals(betas, X, Y):
    """Return OLS residuals Y - X @ betas.

    BUG FIX: the original computed ``betas * X`` (elementwise, wrong
    orientation) instead of the matrix product.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    return Y - X @ np.asarray(betas, dtype=float)


def get_n(X, Y):
    """Return the common number of observations of X and Y.

    Raises
    ------
    ValueError
        If X and Y disagree. (The original called the undefined ``length``,
        printed "Error!" on mismatch, and then returned an unbound variable.)
    """
    n_X = len(X)
    n_Y = len(Y)
    if n_X != n_Y:
        raise ValueError('X and Y must have the same number of observations')
    return n_X


def get_ses(residuals, X, Y):
    """Standard errors of the OLS coefficients.

    Uses sigma^2 = e'e / (n - k) and SE_j = sqrt(sigma^2 * [(X'X)^{-1}]_{jj}).

    BUG FIX: the original used ``^`` for powers/inverses, the undefined
    ``length``, and divided by (N - 1) rather than the residual degrees of
    freedom.
    """
    X = np.asarray(X, dtype=float)
    residuals = np.asarray(residuals, dtype=float)
    n = get_n(X, Y)
    k = X.shape[1]
    sigma2 = residuals @ residuals / (n - k)
    xtx_inv = np.linalg.inv(X.T @ X)
    return np.sqrt(sigma2 * np.diag(xtx_inv))


def get_r2(Y, X, betas):
    """Coefficient of determination R^2 = SSR / SST.

    BUG FIX: the original referenced lowercase ``y`` (undefined), the
    undefined ``mean``, and used ``^`` for squaring.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    y_hat = X @ np.asarray(betas, dtype=float)
    y_bar = np.mean(Y)
    ssr = np.sum((y_hat - y_bar) ** 2)
    sst = np.sum((Y - y_bar) ** 2)
    return ssr / sst


def get_sse(Y, X, betas):
    """Sum of squared errors of the fit X @ betas.

    BUG FIX: the original referenced the undefined ``beta`` and returned the
    vector of squared errors instead of their sum (pick_lowest_lambda needs a
    scalar).
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    e = Y - X @ np.asarray(betas, dtype=float)
    return float(np.sum(e ** 2))


def get_loss_function(SSE, lamb, betas):
    """L1-penalized loss: SSE + lamb * sum(|beta_j|), intercept excluded.

    The first element of ``betas`` is assumed to be the intercept and is not
    penalized (original behavior, kept).
    """
    betas = np.asarray(betas, dtype=float)
    return SSE + lamb * float(np.sum(np.abs(betas[1:])))


def get_coeffs_given_lambda(X, Y, lamb):
    """Ridge coefficients on standardized predictors.

    The first column of X is treated as the intercept and dropped; the
    remaining columns are standardized (mean 0, population std 1) and Y is
    centered, mirroring RIDGE.beta_ridge elsewhere in this package.

    BUG FIX: the original had placeholder syntax errors
    (``Z = # STANDARDIZED X``) and called ``.values()`` on an ndarray.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    Z = X[:, 1:]
    Z = (Z - Z.mean(axis=0)) / Z.std(axis=0)
    Y_c = Y - Y.mean()
    return np.linalg.solve(Z.T @ Z + lamb * np.identity(Z.shape[1]), Z.T @ Y_c)


def pick_lowest_lambda(X, Y):
    """Return the lambda in [0, 1] with the lowest in-sample penalized loss.

    BUG FIXES: ``range(0, 1, 100)`` is an empty range; the loop called
    undefined names (``loss_function``, ``loss``) and returned the unbound
    ``lowest_lamb``.

    NOTE(review): minimizing the *in-sample* penalized loss will usually
    favor lambda = 0; cross-validation would be the statistically sound
    criterion — TODO confirm intended design with the authors.
    """
    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    Z = X[:, 1:]
    Z_std = (Z - Z.mean(axis=0)) / Z.std(axis=0)
    Y_c = Y - Y.mean()
    best_lamb = None
    best_loss = None
    for lamb in np.linspace(0.0, 1.0, 100):
        coeffs = get_coeffs_given_lambda(X, Y, lamb)
        sse = float(np.sum((Y_c - Z_std @ coeffs) ** 2))
        # No intercept is present in the ridge coefficients, so penalize all.
        loss = sse + lamb * float(np.sum(np.abs(coeffs)))
        if best_loss is None or loss < best_loss:
            best_lamb, best_loss = lamb, loss
    return best_lamb


def main():
    """Smoke-test entry point (the original printed "Working!"; kept)."""
    print("Working!")
import numpy as np
import pandas as pd
from scipy.stats import t


def beta_ridge(Y, X, lamb):
    """Compute ridge coefficients on standardized predictors.

    Parameters
    ----------
    Y : (n,) response vector
    X : matrix/DataFrame whose FIRST column is the intercept
    lamb : L2 penalty weight

    Returns
    -------
    numpy.ndarray of coefficients for the standardized, intercept-free
    predictors.

    Note
    ----
    For simplicity we use a matrix inverse, which is O(p^3); SVD would be a
    more efficient approach. Standardization below replicates
    ``sklearn.preprocessing.scale`` (mean 0, population std with ddof=0) so
    the sklearn dependency could be dropped.
    """
    Z = np.asarray(X, dtype=float)[:, 1:]
    Z = (Z - Z.mean(axis=0)) / Z.std(axis=0)
    Y_c = np.asarray(Y, dtype=float).ravel() - np.mean(Y)
    left = np.linalg.inv(Z.T @ Z + lamb * np.identity(Z.shape[1]))
    return left @ (Z.T @ Y_c)


def sse(Y, X, betas):
    """Sum of squared errors of the predictions ``betas @ X.T``.

    Parameters
    ----------
    Y : (n,) response vector
    X : matrix whose columns align with ``betas``
    betas : coefficient vector

    Returns
    -------
    float sum of squared errors
    """
    e = np.asarray(betas, dtype=float) @ np.asarray(X, dtype=float).T - np.asarray(Y, dtype=float)
    return float(np.sum(e ** 2))


def ridge_fit(Y, X, lamb, n_iter=100, progress_disable=False):
    """Estimate ridge standard errors through bootstrapping.

    Parameters
    ----------
    Y : (n,) response vector
    X : matrix/DataFrame with an intercept as its first column
    lamb : L2 penalty weight
    n_iter : number of bootstrap resamples
    progress_disable : suppress the coarse progress printout
        (replaces the original tqdm progress bar; tqdm is no longer needed)

    Returns
    -------
    Results_wrap with the full-sample coefficients plus bootstrap variance,
    SE, t-stats and two-sided p-values.

    BUG FIXES vs original: ``model=print('Lasso, …')`` stored None (and
    mislabeled ridge as "Lasso"), which broke ``Results_wrap.summary()``'s
    string concatenation; ``coefficients=beta_lasso(...)`` was a NameError —
    the function is ``beta_ridge``.
    """
    nobs = np.shape(X)[0]
    K = np.shape(X)[1] - 1  # predictors, excluding the intercept column

    beta_hat_boots = np.zeros((n_iter, K))
    for b_iter in range(n_iter):
        if not progress_disable and n_iter >= 10 and b_iter % max(1, n_iter // 10) == 0:
            print('bootstrap iteration {}/{}'.format(b_iter, n_iter))
        b_index = np.random.choice(range(nobs), nobs, replace=True)
        _Y = pd.DataFrame(Y).iloc[b_index]
        _X = pd.DataFrame(X).iloc[b_index]
        # Save this resample's coefficient estimates.
        beta_hat_boots[b_iter, :] = beta_ridge(np.array(_Y), _X, lamb).squeeze()

    beta_hat_boots = pd.DataFrame(beta_hat_boots)
    beta_hat_boots.index.name = 'boot_iter'

    # Bootstrapped variance / SE of the coefficient estimates.
    beta_hat_boot_var = beta_hat_boots.var(axis=0)
    beta_hat_boot_SE = np.sqrt(beta_hat_boot_var)

    # t-stats: full-sample coefficients over the bootstrap SEs.
    beta_hat_boot_t = beta_ridge(Y, X, lamb) / beta_hat_boot_SE

    # Two-sided p-values from the t distribution.
    beta_hat_boot_p = pd.Series(
        2 * (1 - t.cdf(np.abs(beta_hat_boot_t), df=nobs - K)),
        beta_hat_boot_t.index)

    return Results_wrap(model='Ridge, lambda = {}'.format(lamb),
                        coefficients=beta_ridge(Y, X, lamb),
                        bootstrap_coeffs=beta_hat_boots,
                        bootstrap_coeffs_var=beta_hat_boot_var,
                        bootstrap_coeffs_SE=beta_hat_boot_SE,
                        bootstrap_coeffs_t=beta_hat_boot_t,
                        bootstrap_coeffs_p=beta_hat_boot_p)


class Results_wrap(object):
    """Container for ridge regression results (loosely modeled on statsmodels).

    Attributes mirror the constructor arguments: full-sample coefficients,
    the bootstrap coefficient draws, and their variance / SE / t / p
    summaries. ``cov_type`` defaults to 'nonrobust'.
    """

    def __init__(self, model, coefficients, cov_type='nonrobust',
                 bootstrap_coeffs=None, bootstrap_coeffs_var=None,
                 bootstrap_coeffs_SE=None, bootstrap_coeffs_t=None,
                 bootstrap_coeffs_p=None):
        self.model = model
        self.coefficients = coefficients
        self.cov_type = cov_type
        self.beta_hat_boots = bootstrap_coeffs
        self.beta_hat_boots_var = bootstrap_coeffs_var
        self.beta_hat_boots_SE = bootstrap_coeffs_SE
        self.beta_hat_boots_t = bootstrap_coeffs_t
        self.beta_hat_boots_p = bootstrap_coeffs_p

    def summary(self, title=None):
        """Print and return the summary title.

        TODO: printable summary table like statsmodels.
        """
        if title is None:
            title = '{} Regression Results'.format(self.model)
        print(title)
        return title


def loss(Y, X, lamb, betas):
    """Compute the L2-penalized loss sse + lamb * sum(|beta_j|).

    Parameters
    ----------
    Y : (n,) response vector
    X : matrix (with intercept column) aligned with ``betas``
    lamb : penalty weight
    betas : coefficient vector

    Returns
    -------
    float penalized loss
    """
    return sse(Y, X, betas) + lamb * float(np.sum(np.abs(np.asarray(betas, dtype=float))))
import numpy as np
import pandas as pd
import math
import scipy
np.set_printoptions(suppress=True)  # NOTE(review): module-level side effect kept from the original


def beta_ols(Y, X):
    """Estimate OLS coefficients (X'X)^{-1} X'Y.

    Parameters
    ----------
    Y : (n,) array-like response
    X : (n, k) array-like design matrix (DataFrame or ndarray)

    Returns
    -------
    numpy.ndarray of k coefficients

    Raises
    ------
    ValueError
        On shape mismatch. (BUG FIX: the original ``return print(...)``
        returned None, which crashed every caller downstream.)
    """
    Y = np.asarray(Y, dtype=float)
    Xa = np.asarray(X, dtype=float)
    if Y.ndim != 1:
        raise ValueError('Y must be an Nx1 matrix')
    if Y.shape[0] != Xa.shape[0]:
        raise ValueError('Y and X must have the same number of observations')
    # solve() is numerically safer than forming the explicit inverse.
    return np.linalg.solve(Xa.T @ Xa, Xa.T @ Y)


def resids(Y, X):
    """Return prediction errors e = X @ beta_hat - Y.

    NOTE: the (fitted minus observed) sign convention is kept from the
    original; Sigma() only uses e'e so the sign does not matter there.
    """
    Xa = np.asarray(X, dtype=float)
    return beta_ols(Y, X) @ Xa.T - np.asarray(Y, dtype=float)


def Sigma(Y, X):
    """Estimated variance-covariance matrix of the OLS coefficients.

    Parameters
    ----------
    Y : (n,) array-like response
    X : (n, k) array-like design matrix

    Returns
    -------
    pandas.DataFrame, the k x k covariance matrix

    BUG FIX: the original divided e'e by ``X.shape[1] - 1`` (k - 1); the
    unbiased estimate of sigma^2 divides by the residual degrees of freedom
    (n - k). This is why the file's own SE-vs-statsmodels test had to be
    commented out.
    """
    Xa = np.asarray(X, dtype=float)
    e = resids(Y, X)
    nobs, k = Xa.shape
    sigma2_hat = e @ e / (nobs - k)
    return pd.DataFrame(sigma2_hat * np.linalg.inv(Xa.T @ Xa))


def variance_ols(Y, X):
    """Standard errors of the OLS coefficients.

    NOTE(review): despite the name this returns sqrt of the diagonal of
    Sigma (standard errors), not variances; behavior kept for backward
    compatibility with existing callers.
    """
    return np.sqrt(np.diagonal(Sigma(Y, X)))


def r2_ols(Y, X):
    """Coefficient of determination R^2 = SSR / SST for the OLS fit."""
    Xa = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)
    y_hat = beta_ols(Y, X) @ Xa.T
    y_bar = np.mean(Y)
    SSR = np.sum((y_hat - y_bar) ** 2)
    SST = np.sum((Y - y_bar) ** 2)
    return SSR / SST


def least_sq(Y, X):
    """Print a small OLS report: coefficients, SEs, 95% CIs, R^2, Sigma.

    Improvement: the original recomputed beta_ols/Sigma for every printed
    line; they are now computed once.
    """
    beta = beta_ols(Y, X)
    cov = Sigma(Y, X)
    se = np.sqrt(np.diagonal(cov))
    print('Coefficients = ', beta)
    print('Coeff. SErrs = ', se)
    print('')
    print('95% Confidence Interval for Coefficients')
    print(' Lower Bound:', beta - 1.96 * se)
    print(' Upper Bound:', beta + 1.96 * se)
    print('')
    print('R-Squared:', r2_ols(Y, X))
    print('')
    print("Variance-Covariance Matrix:")
    print(cov)


def deg_freedom(dependent_variable_data):
    """Residual degrees of freedom n - k of an (n, k) data matrix.

    BUG FIXES vs original: missing colon after the def (SyntaxError);
    ``isinstance(..., np.array)`` — np.array is a function, not a type;
    ``dimension`` typo; and an unbound return value for non-ndarray input
    (now any array-like is converted first).
    """
    dims = np.shape(np.asarray(dependent_variable_data))
    if len(dims) != 2:
        raise ValueError('expected a 2-D data matrix')
    return dims[0] - dims[1]
b/Projects/project_2_packages/paddleboat/README.md new file mode 100644 index 0000000..20e3efb --- /dev/null +++ b/Projects/project_2_packages/paddleboat/README.md @@ -0,0 +1,17 @@ +# paddleboat + +paddling together since 2019. + +## Function + +### OLS + +`OLS.py` implements OLS. + +### LASSO + +`LASSO.py` implements LASSO. + +## Team members + +Nadav Tadelis, Harriet Jeon, Casey McQuillan and Joel Becker. diff --git a/Projects/project_2_packages/paddleboat/__init__.py b/Projects/project_2_packages/paddleboat/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/Projects/project_2_packages/paddleboat/setup.py b/Projects/project_2_packages/paddleboat/setup.py new file mode 100644 index 0000000..8af6353 --- /dev/null +++ b/Projects/project_2_packages/paddleboat/setup.py @@ -0,0 +1,22 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="paddleboat2sls", + version="0.0.1", + author="Harriet Jeon, Nadav Tadelis, Casey McQuillan, Joel Becker", + author_email="joelhbkr@gmail.com", + description="Practice publishing an econometric python package", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/nyupredocs/modularizationandtesting", + packages=setuptools.find_packages(), + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) \ No newline at end of file diff --git a/Projects/project_2_packages/paddleboat/simulation/.Rhistory b/Projects/project_2_packages/paddleboat/simulation/.Rhistory new file mode 100755 index 0000000..e69de29 diff --git a/Projects/project_2_packages/paddleboat/simulation/morning.R b/Projects/project_2_packages/paddleboat/simulation/morning.R new file mode 100644 index 0000000..8d06e2e --- /dev/null +++ b/Projects/project_2_packages/paddleboat/simulation/morning.R @@ -0,0 +1,56 @@ 
+#----------------------------------------------------------------------------------# +# Performs simulations from Sunday morning +# Authors: Harriet, Casey, Nadav, Joel + +# Notes: +# +#----------------------------------------------------------------------------------# + + +######################################################## +######################## Set-up ######################## +######################################################## + +# load libraries +packages <- c("dplyr", "data.table", "ggplot2", "tidyr") +new.packages <- packages[!(packages %in% installed.packages()[, "Package"])] +if(length(new.packages)) install.packages(new.packages) +lapply(packages, library, character.only = TRUE) + + +######################################################## +####################### Functions ###################### +######################################################## + +Markets <- 50 + +demand <- function(alpha, delta, price, demand_shock) { + if(delta > 0) { + stop("Demand curves slope downward, sorry AOC") + } + alpha + delta * price + demand_shock +} + +supply <- function(gamma, psi, price, supply_shock) { + if(psi < 0) { + stop("Supply curves slope downward, sorry el presidente") + } + gamma + psi * price + supply_shock +} + +alpha <- rnorm(1, 15, 2) +delta <- -exp(rnorm(1, 0, 1)) +gamma <- rnorm(1, -2, 1) +psi <- exp(rnorm(1, 0, 1)) +xi_d <- rnorm(Markets) +xi_s <- rnorm(Markets) +supply_instrument <- xi_s + rnorm(Markets) +prices <- rep(0, Markets) + +while(sum(abs(demand(alpha, delta, prices, xi_d) - + supply(gamma, psi, prices, xi_s))) > 0.01) { + prices <- prices + 0.1*(demand(alpha, delta, prices, xi_d) - + supply(gamma, psi, prices, xi_s)) +} + + diff --git a/Projects/project_2_packages/paddleboat/simulation/morning.py b/Projects/project_2_packages/paddleboat/simulation/morning.py new file mode 100644 index 0000000..85175f2 --- /dev/null +++ b/Projects/project_2_packages/paddleboat/simulation/morning.py @@ -0,0 +1,35 @@ +import sys 
import sys
sys.path.insert(0, '../OLS')

import numpy as np
from OLS import beta_ols, Sigma
import pandas as pd


def simulation(X, Y):
    """Draw one coefficient vector from the estimated sampling distribution.

    Fits OLS on (Y, X) and samples from N(beta_hat, Sigma_hat).

    Parameters
    ----------
    X : (n, k) design matrix (intercept included by caller)
    Y : (n,) response vector

    Returns
    -------
    numpy.ndarray of shape (k,), one draw of simulated coefficients

    BUG FIXES vs original: removed undefined names (``random_matrix``,
    ``norm``, ``MVN``, the wrongly-cased ``OLS.betas`` / ``OLS.sigma``) and
    dead locals; the draw is now actually computed and returned.
    """
    beta_model = np.asarray(beta_ols(Y, X), dtype=float).ravel()
    covariance_model = np.asarray(Sigma(Y, X), dtype=float)
    return np.random.multivariate_normal(beta_model, covariance_model)


def many_simulations(X, Y, sims=100):
    """Stack ``sims`` independent draws from simulation() into a (sims, k) array.

    BUG FIXES vs original: ``vector(length=sims)`` and ``for s in sims``
    (iterating an int) were not Python; the unbound ``a`` accumulator never
    stored anything and nothing was returned.
    """
    return np.vstack([simulation(X, Y) for _ in range(sims)])
coeff_estimates = beta_ols(dependent_var, independent_vars) + self.assertIsNone(np.testing.assert_almost_equal([1.5, 2.5, 2, 3, 6], coeff_estimates, decimal=1)) + + + def test_Sigma(self): + np.random.seed(1784) # testing different seed + n = 50000 + ## DGP ## + means = [3, -1.5, 1.1, 2.3, -1, 3] + cov = [ + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 1]] + + ## Data Generation ## + X1, X2, X3, X4, Z1, Z2 = np.random.multivariate_normal(means, cov, n).T + epsilon = np.random.normal(0, 1, n) + + # True model + Y = 1.5 + 2.5*X1 + 2*X2 + 3*X3 + 6*X4 + epsilon + + independent_vars = pd.DataFrame({'X1' : X1, 'X2' : X2, 'X3' : X3, 'X4' : X4}) + independent_vars = sm.add_constant(independent_vars) + dependent_var = Y + our_CovMatrix = Sigma(dependent_var, independent_vars) + + model = sm.OLS(dependent_var, independent_vars) + results = model.fit() + stats_CovMatrix = results.cov_params() + + print(our_CovMatrix) + print(stats_CovMatrix) + + + def test_CoefStdErrors(self): + np.random.seed(1435) # testing different seed + n = 50000 + ## DGP ## + means = [3, -1.5, 1.1, 2.3, -1, 3] + cov = [ + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 1]] + + ## Data Generation ## + X1, X2, X3, X4, Z1, Z2 = np.random.multivariate_normal(means, cov, n).T + epsilon = np.random.normal(0, 1, n) + + # True model + Y = 1.5 + 2.5*X1 + 2*X2 + 3*X3 + 6*X4 + epsilon + + independent_vars = pd.DataFrame({'X1' : X1, 'X2' : X2, 'X3' : X3, 'X4' : X4}) + independent_vars = sm.add_constant(independent_vars) + dependent_var = Y + our_CoefStdErrors = np.sqrt(np.diagonal(Sigma(dependent_var, independent_vars))) + + model = sm.OLS(dependent_var, independent_vars) + results = model.fit() + stats_CoefStdErrors = results.bse + + #self.assertIsNone(np.testing.assert_almost_equal(our_CoefStdErrors, stats_CoefStdErrors, decimal=1)) + + 
#print(our_CoefStdErrors) + #print(stats_CoefStdErrors) + + #least_sq(dependent_var, independent_vars) + #print(results.summary()) + + + def test_r2(self): + np.random.seed(17855) # testing different seed + n = 50000 + ## DGP ## + means = [3, -1.5, 1.1, 2.3, -1, 3] + cov = [ + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 1]] + + ## Data Generation ## + X1, X2, X3, X4, Z1, Z2 = np.random.multivariate_normal(means, cov, n).T + epsilon = np.random.normal(0, 1, n) + + # True model + Y = 1.5 + 2.5*X1 + 2*X2 + 3*X3 + 6*X4 + epsilon + + independent_vars = pd.DataFrame({'X1' : X1, 'X2' : X2, 'X3' : X3, 'X4' : X4}) + independent_vars = sm.add_constant(independent_vars) + dependent_var = Y + our_r2 = r2_ols(dependent_var, independent_vars) + + model = sm.OLS(dependent_var, independent_vars) + results = model.fit() + stats_r2 = results.rsquared + + self.assertIsNone(np.testing.assert_almost_equal(our_r2, stats_r2, decimal=5)) + + + +if __name__ == "__main__": + unittest.main() diff --git a/Projects/project_2_packages/paddleboat/test/test_RIDGE.py b/Projects/project_2_packages/paddleboat/test/test_RIDGE.py new file mode 100644 index 0000000..83c8eaa --- /dev/null +++ b/Projects/project_2_packages/paddleboat/test/test_RIDGE.py @@ -0,0 +1,79 @@ +import sys +sys.path.insert(0,'../OLS') + +import numpy as np +import unittest +from OLS import beta_ols, Sigma +import statsmodels.api as sm +import pandas as pd + + + +class TestHPM(unittest.TestCase): + + def test_simple_b_ols(self): + np.random.seed(60683) # testing different seed + n = 50000 + ## DGP ## + means = [3, -1.5, 1.1, 2.3, -1, 3] + cov = [ + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 1]] + + ## Data Generation ## + X1, X2, X3, X4, Z1, Z2 = np.random.multivariate_normal(means, cov, n).T + epsilon = np.random.normal(0, 1, n) + + # True model + Y = 1.5 + 2.5*X1 + 
2*X2 + 3*X3 + 6*X4 + epsilon + + independent_vars = pd.DataFrame({'X1' : X1, 'X2' : X2, 'X3' : X3, 'X4' : X4}) + independent_vars = sm.add_constant(independent_vars) + dependent_vars = Y + + # running model + coeff_estimates = beta_ols(dependent_vars, independent_vars) + self.assertIsNone(np.testing.assert_almost_equal([1.5, 2.5, 2, 3, 6], coeff_estimates, decimal=1)) + + + def test_Sigma(self): + np.random.seed(1435) # testing different seed + n = 500000 + ## DGP ## + means = [3, -1.5, 1.1, 2.3, -1, 3] + cov = [ + [1, 0, 0, 0, 0, 0], + [0, 1, 0, 0, 0, 0], + [0, 0, 1, 0, 0, 0], + [0, 0, 0, 1, 0, 0], + [0, 0, 0, 0, 1, 0], + [0, 0, 0, 0, 0, 1]] + + ## Data Generation ## + X1, X2, X3, X4, Z1, Z2 = np.random.multivariate_normal(means, cov, n).T + epsilon = np.random.normal(0, 1, n) + + # True model + Y = 1.5 + 2.5*X1 + 2*X2 + 3*X3 + 6*X4 + epsilon + + independent_vars = pd.DataFrame({'X1' : X1, 'X2' : X2, 'X3' : X3, 'X4' : X4}) + independent_vars = sm.add_constant(independent_vars) + dependent_vars = Y + + + + # running model + covariance_matrix = Sigma(dependent_vars, independent_vars) + print(covariance_matrix) + test_kc = independent_vars.cov() + print(test_kc) + + #self.assertIsNone(np.testing.assert_almost_equal([1.5, 2.5, 2, 3, 6], coeff_estimates, decimal=1)) + + +if __name__ == "__main__": + unittest.main()