diff --git a/README.md b/README.md index dcd6e23..bdc7411 100644 --- a/README.md +++ b/README.md @@ -1,46 +1 @@ -# Classifing spam - -## Description - -Use the Spambase dataset to classify spam. This data is already parsed down from email to features. - -## Objectives - -### Learning Objectives - -After completing this assignment, you should understand: - -* Simple Bayesian analysis -* The importance of separating training and test data - -### Performance Objectives - -After completing this assignment, you should be able to: - -* Create a Bayesian classifier -* Train your classifier -* Test your classifier - -## Details - -### Deliverables - -* A Git repo called spambase containing at least: - * `README.md` file explaining how to run your project - * a `requirements.txt` file - -### Requirements - -* No PEP8 or Pyflakes warnings or errors - -## Normal Mode - -Go to the UCI Machine Learning repository and [download the Spambase dataset](https://archive.ics.uci.edu/ml/datasets/Spambase). Make sure you [read the documentation for the data](https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.DOCUMENTATION). This explains what the attributes are in the data file. - -Subsample the data set so 60% is training data and 40% is test data. You can subsample however you like, including splitting the original file. Just make sure that you have a representative data set. (The original is about 60% not-spam and 40% spam.) - -Then write code to classify the data into spam and not-spam, training with your training data and testing on your test data. Try multiple classifiers to see which gives you the highest success. - -## Hard Mode - -In addition to the normal mode requirements, try reducing or changing your features in order to get better results. +The entirety of my code and analysis is contained within an IPython Notebook file titled spambase.ipynb. 
diff --git a/requirements.txt b/requirements.txt index ed8975d..759d71f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +ipython[all] scikit-learn pandas numpy diff --git a/spambase.ipynb b/spambase.ipynb new file mode 100644 index 0000000..61b396a --- /dev/null +++ b/spambase.ipynb @@ -0,0 +1,849 @@ +{ + "metadata": { + "name": "", + "signature": "sha256:6b28f1e85f1b01944691cf3cd5e7fc252c80c8193bfff35d7ee22478ec2cd2aa" + }, + "nbformat": 3, + "nbformat_minor": 0, + "worksheets": [ + { + "cells": [ + { + "cell_type": "code", + "collapsed": false, + "input": [ + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.naive_bayes import MultinomialNB\n", + "from sklearn import metrics" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 92 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "data = pd.read_csv(\"spambase.data\")" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 3 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "data.head()" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
00.640.64.10.10.320.20.30.40.50.6...0.400.410.420.7780.430.443.756612781
0 0.21 0.28 0.50 0 0 0.28 0.21 0.07 0.00 0.94... 0.00 0.132 0 0.372 0.180 0.048 5.114 101 1028 1
1 0.06 0.00 0.71 0 0 0.19 0.19 0.12 0.64 0.25... 0.01 0.143 0 0.276 0.184 0.010 9.821 485 2259 1
2 0.00 0.00 0.00 0 0 0.00 0.31 0.63 0.31 0.63... 0.00 0.137 0 0.137 0.000 0.000 3.537 40 191 1
3 0.00 0.00 0.00 0 0 0.00 0.31 0.63 0.31 0.63... 0.00 0.135 0 0.135 0.000 0.000 3.537 40 191 1
4 0.00 0.00 0.00 0 0 0.00 0.00 1.85 0.00 0.00... 0.00 0.223 0 0.000 0.000 0.000 3.000 15 54 1
\n", + "

5 rows \u00d7 58 columns

\n", + "
" + ], + "metadata": {}, + "output_type": "pyout", + "prompt_number": 31, + "text": [ + " 0 0.64 0.64.1 0.1 0.32 0.2 0.3 0.4 0.5 0.6 ... 0.40 \\\n", + "0 0.21 0.28 0.50 0 0 0.28 0.21 0.07 0.00 0.94 ... 0.00 \n", + "1 0.06 0.00 0.71 0 0 0.19 0.19 0.12 0.64 0.25 ... 0.01 \n", + "2 0.00 0.00 0.00 0 0 0.00 0.31 0.63 0.31 0.63 ... 0.00 \n", + "3 0.00 0.00 0.00 0 0 0.00 0.31 0.63 0.31 0.63 ... 0.00 \n", + "4 0.00 0.00 0.00 0 0 0.00 0.00 1.85 0.00 0.00 ... 0.00 \n", + "\n", + " 0.41 0.42 0.778 0.43 0.44 3.756 61 278 1 \n", + "0 0.132 0 0.372 0.180 0.048 5.114 101 1028 1 \n", + "1 0.143 0 0.276 0.184 0.010 9.821 485 2259 1 \n", + "2 0.137 0 0.137 0.000 0.000 3.537 40 191 1 \n", + "3 0.135 0 0.135 0.000 0.000 3.537 40 191 1 \n", + "4 0.223 0 0.000 0.000 0.000 3.000 15 54 1 \n", + "\n", + "[5 rows x 58 columns]" + ] + } + ], + "prompt_number": 31 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "xdata = data.loc[::,\"0\":\"278\"]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 84 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "ydata = data[\"1\"]" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 85 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.cross_validation import train_test_split\n", + "x_train, x_test, y_train, y_test = train_test_split(xdata, ydata, test_size=0.4, random_state=0)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 86 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Create a function to print out the results of each classifier:" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "def run_classifier(clf, x_train, x_test, y_train, y_test, x_data, y_data):\n", + " clf.fit(x_train, y_train)\n", + " predicted = clf.predict(x_test)\n", + " print(metrics.classification_report(y_test, predicted))\n", + " 
print(metrics.confusion_matrix(y_test, predicted))\n", + " print(metrics.f1_score(y_test, predicted))\n", + " scores = cross_val_score(clf, xdata, ydata, cv=5)\n", + " " + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 94 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Multinomial Naive Bayes" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "bayclass = MultinomialNB()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 87 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "run_classifier(bayclass, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.82 0.83 0.83 1097\n", + " 1 0.75 0.74 0.74 743\n", + "\n", + "avg / total 0.79 0.79 0.79 1840\n", + "\n", + "[[912 185]\n", + " [196 547]]\n", + "0.741694915254\n" + ] + } + ], + "prompt_number": 104 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Gaussian Naive Bayes" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.naive_bayes import GaussianNB" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 32 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "gauss = GaussianNB()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 33 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "run_classifier(gauss, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.97 0.69 0.80 1097\n", + " 1 0.68 0.97 0.80 743\n", + "\n", + "avg / total 0.85 0.80 0.80 1840\n", + 
"\n", + "[[753 344]\n", + " [ 23 720]]\n", + "0.796900940786\n" + ] + } + ], + "prompt_number": 96 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Decision Tree Classifier" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.tree import DecisionTreeClassifier\n", + "tree = DecisionTreeClassifier()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 99 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "run_classifier(tree, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.94 0.91 0.93 1097\n", + " 1 0.88 0.92 0.90 743\n", + "\n", + "avg / total 0.92 0.91 0.91 1840\n", + "\n", + "[[1003 94]\n", + " [ 63 680]]\n", + "0.89650626236\n" + ] + } + ], + "prompt_number": 98 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Random Forest Classifier" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.ensemble import RandomForestClassifier" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 53 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "forest = RandomForestClassifier()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 101 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "run_classifier(forest, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.94 0.97 0.96 1097\n", + " 1 0.96 0.92 0.94 743\n", + "\n", + "avg / total 0.95 0.95 0.95 1840\n", + "\n", + "[[1069 28]\n", + " [ 63 680]]\n", + "0.937284631289\n" + ] + } + ], 
+ "prompt_number": 100 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "K Neighbors Classifier" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.neighbors import KNeighborsClassifier" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 56 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "neighbors = KNeighborsClassifier()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 103 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "run_classifier(neighbors, x_train, x_test, y_train, y_test, xdata, ydata)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.81 0.85 0.83 1097\n", + " 1 0.76 0.70 0.73 743\n", + "\n", + "avg / total 0.79 0.79 0.79 1840\n", + "\n", + "[[936 161]\n", + " [220 523]]\n", + "0.733006306938\n" + ] + } + ], + "prompt_number": 102 + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "Can we figure out which features are helpful and which aren't?" 
+ ] + }, + { + "cell_type": "heading", + "level": 3, + "metadata": {}, + "source": [ + "I tried something called ExtraTreesClassifier" + ] + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "from sklearn.ensemble import ExtraTreesClassifier\n", + "\n", + "X, y = xdata, ydata\n", + "X.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 42, + "text": [ + "(4600, 57)" + ] + } + ], + "prompt_number": 42 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "clf = ExtraTreesClassifier()\n", + "X_new = clf.fit(X, y).transform(X)\n", + "clf.feature_importances_ " + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 43, + "text": [ + "array([ 0.01329835, 0.0079715 , 0.01509007, 0.0021716 , 0.00276668,\n", + " 0.01658062, 0.09933421, 0.01570107, 0.02455518, 0.00837051,\n", + " 0.0183419 , 0.02024683, 0.00774485, 0.00722078, 0.01486713,\n", + " 0.04068142, 0.03525079, 0.01349685, 0.04942057, 0.00811502,\n", + " 0.07483391, 0.0100333 , 0.0315938 , 0.01704934, 0.03070221,\n", + " 0.01114189, 0.0291203 , 0.01327662, 0.00971931, 0.01131513,\n", + " 0.00479219, 0.00280843, 0.00443832, 0.00210647, 0.00783549,\n", + " 0.00497984, 0.02002667, 0.00151691, 0.00518695, 0.00257288,\n", + " 0.0024335 , 0.01093254, 0.00655802, 0.00576946, 0.01401507,\n", + " 0.02525882, 0.00134001, 0.00270929, 0.01051194, 0.01370621,\n", + " 0.00558226, 0.05104588, 0.02472411, 0.00536161, 0.03504849,\n", + " 0.037736 , 0.03099089])" + ] + } + ], + "prompt_number": 43 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "X_new.shape" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 44, + "text": [ + "(4600, 18)" + ] + } + ], + "prompt_number": 44 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "X_new" 
+ ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "metadata": {}, + "output_type": "pyout", + "prompt_number": 48, + "text": [ + "array([[ 2.10000000e-01, 0.00000000e+00, 2.10000000e-01, ...,\n", + " 5.11400000e+00, 1.01000000e+02, 1.02800000e+03],\n", + " [ 1.90000000e-01, 6.40000000e-01, 3.80000000e-01, ...,\n", + " 9.82100000e+00, 4.85000000e+02, 2.25900000e+03],\n", + " [ 3.10000000e-01, 3.10000000e-01, 3.10000000e-01, ...,\n", + " 3.53700000e+00, 4.00000000e+01, 1.91000000e+02],\n", + " ..., \n", + " [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,\n", + " 1.40400000e+00, 6.00000000e+00, 1.18000000e+02],\n", + " [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,\n", + " 1.14700000e+00, 5.00000000e+00, 7.80000000e+01],\n", + " [ 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,\n", + " 1.25000000e+00, 5.00000000e+00, 4.00000000e+01]])" + ] + } + ], + "prompt_number": 48 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "x_newtrain, x_newtest, y_newtrain, y_newtest = train_test_split(X_new, y, test_size=0.4, random_state=0)" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 105 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "newtree = DecisionTreeClassifier()" + ], + "language": "python", + "metadata": {}, + "outputs": [], + "prompt_number": 107 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "run_classifier(newtree, x_newtrain, x_newtest, y_newtrain, y_newtest, X_new, y)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.92 0.92 0.92 1097\n", + " 1 0.88 0.89 0.88 743\n", + "\n", + "avg / total 0.91 0.91 0.91 1840\n", + "\n", + "[[1006 91]\n", + " [ 82 661]]\n", + "0.884280936455\n" + ] + } + ], + "prompt_number": 108 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "newmultibay = 
MultinomialNB()\n", + "run_classifier(newmultibay, x_newtrain, x_newtest, y_newtrain, y_newtest, X_new, y)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.76 0.75 0.76 1097\n", + " 1 0.64 0.65 0.65 743\n", + "\n", + "avg / total 0.71 0.71 0.71 1840\n", + "\n", + "[[824 273]\n", + " [259 484]]\n", + "0.645333333333\n" + ] + } + ], + "prompt_number": 111 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "newgauss = GaussianNB()\n", + "run_classifier(newgauss, x_newtrain, x_newtest, y_newtrain, y_newtest, X_new, y)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.89 0.94 0.92 1097\n", + " 1 0.91 0.84 0.87 743\n", + "\n", + "avg / total 0.90 0.90 0.90 1840\n", + "\n", + "[[1035 62]\n", + " [ 122 621]]\n", + "0.870967741935\n" + ] + } + ], + "prompt_number": 112 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "newforest = RandomForestClassifier()\n", + "run_classifier(newforest, x_newtrain, x_newtest, y_newtrain, y_newtest, X_new, y)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 0.94 0.96 0.95 1097\n", + " 1 0.94 0.91 0.93 743\n", + "\n", + "avg / total 0.94 0.94 0.94 1840\n", + "\n", + "[[1051 46]\n", + " [ 64 679]]\n", + "0.925068119891\n" + ] + } + ], + "prompt_number": 113 + }, + { + "cell_type": "code", + "collapsed": false, + "input": [ + "newneighbors = KNeighborsClassifier()\n", + "run_classifier(newneighbors, x_newtrain, x_newtest, y_newtrain, y_newtest, X_new, y)" + ], + "language": "python", + "metadata": {}, + "outputs": [ + { + "output_type": "stream", + "stream": "stdout", + "text": [ + " precision recall 
f1-score support\n", + "\n", + " 0 0.79 0.86 0.82 1097\n", + " 1 0.76 0.67 0.71 743\n", + "\n", + "avg / total 0.78 0.78 0.78 1840\n", + "\n", + "[[939 158]\n", + " [246 497]]\n", + "0.711015736767\n" + ] + } + ], + "prompt_number": 114 + } + ], + "metadata": {} + } + ] +} \ No newline at end of file