From c2bf032d74a0e9c97267da8cfef20a4b8b93bec5 Mon Sep 17 00:00:00 2001
From: Ted Callahan <CCallahanIV@gmail.com>
Date: Tue, 21 Feb 2017 13:35:44 -0800
Subject: [PATCH 01/10] Initial commit of k-means classifier.

---
 src/kmeans.py      | 23 +++++++++++++++++++++++
 src/test_kmeans.py |  1 +
 2 files changed, 24 insertions(+)
 create mode 100644 src/kmeans.py
 create mode 100644 src/test_kmeans.py

diff --git a/src/kmeans.py b/src/kmeans.py
new file mode 100644
index 0000000..6f63c53
--- /dev/null
+++ b/src/kmeans.py
@@ -0,0 +1,23 @@
+"""Implementation of the K-Means Classifier."""
+
+
+class KMeansClassifier(object):
+    """Implementation of the K-Means Classifier."""
+
+    def __init__(self, max_iter=None, min_step=None):
+        """Initialize a K-Means Classifier object."""
+        self.max_iter = max_iter
+        self.min_step = min_step
+        self.fitted = False
+
+    def fit(self, data, k=2):
+        """Fit K centroids to given data."""
+        if k < 0 or k > len(data):
+            raise ValueError("K must a positive integer less than the length of data.")
+
+        self.fitted = True
+        pass
+
+    def predict(self, data):
+        """Predict the class of given test data after fit."""
+        pass
\ No newline at end of file
diff --git a/src/test_kmeans.py b/src/test_kmeans.py
new file mode 100644
index 0000000..9aeb4b0
--- /dev/null
+++ b/src/test_kmeans.py
@@ -0,0 +1 @@
+"""Tests for the K-Means Classifier."""
\ No newline at end of file

From 4bb18c8f3947014f3e960183cdcd549fbd27ea24 Mon Sep 17 00:00:00 2001
From: pasaunders <kaorti@gmail.com>
Date: Tue, 21 Feb 2017 14:48:27 -0800
Subject: [PATCH 02/10] sketching out program structure

---
 src/kmeans.py | 37 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/src/kmeans.py b/src/kmeans.py
index 6f63c53..3869f49 100644
--- a/src/kmeans.py
+++ b/src/kmeans.py
@@ -15,9 +15,44 @@ def fit(self, data, k=2):
         if k < 0 or k > len(data):
             raise ValueError("K must a positive integer less than the length of data.")
 
+        centriods = randomCentroids(k)
+
+        iteration = 0
+        old_centroids = None
+
+        while not _should_stop(old_centroids, centroids, iteration):
+            old_centroids = centroids
+            iteration += 1
+
+            labels = _classify(data, centroids)
+
+            centroids = _assign_centroids(data, labels, k)
+
         self.fitted = True
         pass
 
     def predict(self, data):
         """Predict the class of given test data after fit."""
-        pass
\ No newline at end of file
+        if self.fitted == False:
+            raise RuntimeError('Run KMeansClassifier.fit before running predict')
+        pass
+
+    def _calc_distance(self, pt1, pt2):
+        """Calculate the distance between two points."""
+        pass
+
+    def _classify(self, data, centroids):
+        """Assign each datapoint to the nearest centroid."""
+        pass
+
+    def _find_mean(self, points):
+        """Find the mean coordinates of points."""
+        pass
+
+    def _assign_centroids(self, data, labels, k):
+        """Assign centriod coordinates based on distance to member points."""
+        pass
+
+    def _should_stop(self, old_centroids, centroids, iteration):
+        """Determine if the fit should stop runnng."""
+        pass

From c17a1193a87ce0e0ae0e0c5a64bae42666a47b06 Mon Sep 17 00:00:00 2001
From: Ted Callahan <CCallahanIV@gmail.com>
Date: Tue, 21 Feb 2017 15:51:06 -0800
Subject: [PATCH 03/10] Wrote Calc distance, copied in additions from Julien
 and Rick, wrote unit test for _calc_distance.

---
 src/kmeans.py      | 33 ++++++++++++++++++++++++---------
 src/test_kmeans.py | 20 +++++++++++++++++++-
 2 files changed, 43 insertions(+), 10 deletions(-)

diff --git a/src/kmeans.py b/src/kmeans.py
index 3869f49..25308de 100644
--- a/src/kmeans.py
+++ b/src/kmeans.py
@@ -1,45 +1,50 @@
 """Implementation of the K-Means Classifier."""
+import random
+from math import sqrt
 
 
 class KMeansClassifier(object):
     """Implementation of the K-Means Classifier."""
 
-    def __init__(self, max_iter=None, min_step=None):
+    def __init__(self, max_iter=5, min_step=None):
         """Initialize a K-Means Classifier object."""
         self.max_iter = max_iter
         self.min_step = min_step
         self.fitted = False
+        self.centroids = None
 
     def fit(self, data, k=2):
         """Fit K centroids to given data."""
         if k < 0 or k > len(data):
             raise ValueError("K must a positive integer less than the length of data.")
 
-        centriods = randomCentroids(k)
+        centroids = self._random_centroids(k)
 
         iteration = 0
         old_centroids = None
 
-        while not _should_stop(old_centroids, centroids, iteration):
+        while not self._should_stop(old_centroids, centroids, iteration):
             old_centroids = centroids
             iteration += 1
 
-            labels = _classify(data, centroids)
+            labels = self._classify(data, centroids)
 
-            centroids = _assign_centroids(data, labels, k)
+            centroids = self._assign_centroids(data, labels, k)
 
         self.fitted = True
-        pass
 
     def predict(self, data):
         """Predict the class of given test data after fit."""
-        if self.fitted == False:
+        if self.fitted is False:
             raise RuntimeError('Run KMeansClassifier.fit before running predict')
         pass
 
     def _calc_distance(self, pt1, pt2):
         """Calculate the distance between two points."""
-        pass
+        dist = 0.0
+        for i in range(len(pt1) - 2):
+            dist += (pt1[i] - pt2[i])**2
+        return sqrt(dist)
 
     def _classify(self, data, centroids):
         """Assign each datapoint to the nearest centroid."""
@@ -47,7 +52,10 @@ def _classify(self, data, centroids):
 
     def _find_mean(self, points):
         """Find the mean coordinates of points."""
-        pass
+        col_means = []
+        for column in points:
+            col_means.append(column.mean)
+        return col_means
 
     def _assign_centroids(self, data, labels, k):
         """Assign centriod coordinates based on distance to member points."""
@@ -56,3 +64,10 @@ def _assign_centroids(self, data, labels, k):
     def _should_stop(self, old_centroids, centroids, iteration):
         """Determine if the fit should stop runnng."""
         pass
+
+    def _random_centroids(self, data, k):
+        """Return randomly generated centroids."""
+        k_list = []
+        for i in range(k):
+            k_list.append([random.uniform(min(data[column]), max(data[column])) for column in range(len(data))])
+        return k_list
diff --git a/src/test_kmeans.py b/src/test_kmeans.py
index 9aeb4b0..4affe51 100644
--- a/src/test_kmeans.py
+++ b/src/test_kmeans.py
@@ -1 +1,19 @@
-"""Tests for the K-Means Classifier."""
\ No newline at end of file
+"""Tests for the K-Means Classifier."""
+
+from math import sqrt
+import pandas as pd
+import pytest
+
+
+@pytest.fixture
+def kmc():
+    """Fixture to return a default KMC."""
+    from kmeans import KMeansClassifier
+    return KMeansClassifier()
+
+
+def test_calc_distance(kmc):
+    """Test the _calc_distance method of the K Means Classifier."""
+    rows = [[2, 2, 1], [0, 0, 1]]
+    data = pd.DataFrame(data=rows, columns=['x', 'y', 'class'])
+    assert kmc._calc_distance(data.loc[0], data.loc[1]) == sqrt(8)

From ea79357e1b41793353ca8b5a4c52caf72cc3f340 Mon Sep 17 00:00:00 2001
From: pasaunders <kaorti@gmail.com>
Date: Tue, 21 Feb 2017 18:07:01 -0800
Subject: [PATCH 04/10] added tests and troubleshooting

---
 src/kmeans.py      | 60 ++++++++++++++++++++++++++++++----------------
 src/test_kmeans.py | 52 +++++++++++++++++++++++++++++++++++++---
 2 files changed, 88 insertions(+), 24 deletions(-)

diff --git a/src/kmeans.py b/src/kmeans.py
index 25308de..aed2d8a 100644
--- a/src/kmeans.py
+++ b/src/kmeans.py
@@ -16,58 +16,76 @@ def __init__(self, max_iter=5, min_step=None):
     def fit(self, data, k=2):
         """Fit K centroids to given data."""
         if k < 0 or k > len(data):
-            raise ValueError("K must a positive integer less than the length of data.")
-
-        centroids = self._random_centroids(k)
+            raise ValueError("K must be a positive integer less than the length of data.")
 
+        data['group'] = None
+        self.centroids = self._random_centroids(data, k)
         iteration = 0
         old_centroids = None
 
-        while not self._should_stop(old_centroids, centroids, iteration):
-            old_centroids = centroids
+        while not self._should_stop(old_centroids, iteration, k):
+            old_centroids = self.centroids
             iteration += 1
-
-            labels = self._classify(data, centroids)
-
-            centroids = self._assign_centroids(data, labels, k)
-
+            self._classify(data)
+            self.centroids = self._assign_centroids(data, k)
         self.fitted = True
 
     def predict(self, data):
         """Predict the class of given test data after fit."""
         if self.fitted is False:
-            raise RuntimeError('Run KMeansClassifier.fit before running predict')
-        pass
+            raise RuntimeError('Run KMeansClassifier.fit before running predict.')
+        self._classify(data)
+        return data
 
     def _calc_distance(self, pt1, pt2):
         """Calculate the distance between two points."""
         dist = 0.0
+        print('distance called from: ', pt1, pt2)
         for i in range(len(pt1) - 2):
+            print('i:', i, 'pt1:', pt1[i], 'pt2:', pt2[i])
             dist += (pt1[i] - pt2[i])**2
         return sqrt(dist)
 
-    def _classify(self, data, centroids):
+    def _classify(self, data):
         """Assign each datapoint to the nearest centroid."""
-        pass
+        for point in data.iterrows():
+            print('point: ', point)
+            distances = []
+            for cent in self.centroids:
+                print('cent', cent)
+                distances.append(self._calc_distance(cent, point))
+            point.group = distances.index(min(distances))
 
     def _find_mean(self, points):
         """Find the mean coordinates of points."""
         col_means = []
         for column in points:
-            col_means.append(column.mean)
+            col_means.append(points[column].mean())
         return col_means
 
-    def _assign_centroids(self, data, labels, k):
-        """Assign centriod coordinates based on distance to member points."""
-        pass
+    def _assign_centroids(self, data, k):
+        """Set centroid coordinates to mean of their assigned datapoints."""
+        groups = []
+        for _ in range(k):
+            groups.append(data[data["groups"] == k])
+        for idx, group in enumerate(groups):
+            self.centroids[idx] = self._find_mean(group)
 
-    def _should_stop(self, old_centroids, centroids, iteration):
+    def _should_stop(self, old_centroids, centroids, iteration, k):
         """Determine if the fit should stop runnng."""
-        pass
+        if iteration > self.max_iter:
+            return True
+        centroid_movements = []
+        for i in range(k):
+            centroid_movements.append(self._calc_distance(old_centroids[i], centroids[i]))
+        if self.min_step:
+            if max(centroid_movements) < self.min_step:
+                return True
+        return False
 
     def _random_centroids(self, data, k):
         """Return randomly generated centroids."""
         k_list = []
         for i in range(k):
-            k_list.append([random.uniform(min(data[column]), max(data[column])) for column in range(len(data))])
+            k_list.append([random.uniform(min(data[column]), max(data[column])) for column in range(len(data) - 2)])
         return k_list
diff --git a/src/test_kmeans.py b/src/test_kmeans.py
index 4affe51..942a103 100644
--- a/src/test_kmeans.py
+++ b/src/test_kmeans.py
@@ -3,6 +3,7 @@
 from math import sqrt
 import pandas as pd
 import pytest
+import numpy as np
 
 
 @pytest.fixture
@@ -12,8 +13,53 @@ def kmc():
     return KMeansClassifier()
 
 
-def test_calc_distance(kmc):
+@pytest.fixture
+def some_data():
+    """Fixture to return some dummy data."""
+    data = np.array([[2, 3], [4, 5], [6, 7], [8, 9], [1, 1], [2, 2], [3, 3], [4, 4]])
+    return pd.DataFrame(data=data)
+
+
+def test_calc_distance_rows(kmc):
     """Test the _calc_distance method of the K Means Classifier."""
-    rows = [[2, 2, 1], [0, 0, 1]]
-    data = pd.DataFrame(data=rows, columns=['x', 'y', 'class'])
+    rows = [[2, 2, 1, 0], [0, 0, 1, 0]]
+    data = pd.DataFrame(data=rows, columns=['x', 'y', 'class', 'dummy'])
     assert kmc._calc_distance(data.loc[0], data.loc[1]) == sqrt(8)
+
+
+def test_calc_distance(kmc):
+    """Test distance calculator helper method."""
+    assert kmc._calc_distance([0, 0, 0, 0], [3, 4, 0, 0]) == 5.0
+
+
+def test_find_mean(kmc, some_data):
+    """Unit test for find mean."""
+    data_means = kmc._find_mean(some_data)
+    assert data_means == [3.75, 4.25]
+
+
+def test_should_stop_false_max_iter(kmc):
+    """Test that a kmc does not doesnt stop iterating, with only a max iter of 5 set."""
+    old_centroids = [[2, 3], [7, 8]]
+    new_centroids = [[2, 4], [90, 55]]
+    assert not kmc._should_stop(old_centroids, new_centroids, 3, 2)
+
+
+def test_should_stop_true_max_iter(kmc):
+    """Test that a kmc does not doesnt stop iterating, with only a max iter of 5 set."""
+    old_centroids = [[2, 3], [7, 8]]
+    new_centroids = [[2, 4], [90, 55]]
+    assert kmc._should_stop(old_centroids, new_centroids, 6, 2)
+
+
+def test_classify():
+    """Unit test for classifying datapoints."""
+    from kmeans import KMeansClassifier
+    clusters = KMeansClassifier()
+    rows = [[4, 4, 1, 0], [0, 0, 1, 0]]
+    data = pd.DataFrame(data=rows, columns=['x', 'y', 'class', 'group'])
+    clusters.centroids = None
+    clusters.centroids = [[4, 5, 0, 0], [0, 1, 0, 0]]
+    # import pdb; pdb.set_trace()
+    clusters._classify(data)
+    assert True

From f3a06bd9a8e44ec74c4cd1403b55f1777f59ec2f Mon Sep 17 00:00:00 2001
From: Ted Callahan <CCallahanIV@gmail.com>
Date: Tue, 21 Feb 2017 20:13:38 -0800
Subject: [PATCH 05/10] troubleshot _classify function and tests.

---
 src/kmeans.py      | 6 +++---
 src/test_kmeans.py | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/kmeans.py b/src/kmeans.py
index aed2d8a..9fada1a 100644
--- a/src/kmeans.py
+++ b/src/kmeans.py
@@ -47,14 +47,14 @@ def _calc_distance(self, pt1, pt2):
         return sqrt(dist)
 
     def _classify(self, data):
-        """Assign each datapoint to the nearest centroid."""
+        """Assign each datapoint t o the nearest centroid."""
         for point in data.iterrows():
             print('point: ', point)
             distances = []
             for cent in self.centroids:
                 print('cent', cent)
-                distances.append(self._calc_distance(cent, point))
-            point.group = distances.index(min(distances))
+                distances.append(self._calc_distance(cent, point[1]))
+            point[1]["group"] = distances.index(min(distances))
 
     def _find_mean(self, points):
         """Find the mean coordinates of points."""
diff --git a/src/test_kmeans.py b/src/test_kmeans.py
index 942a103..ec7e472 100644
--- a/src/test_kmeans.py
+++ b/src/test_kmeans.py
@@ -60,6 +60,5 @@ def test_classify():
     data = pd.DataFrame(data=rows, columns=['x', 'y', 'class', 'group'])
     clusters.centroids = None
     clusters.centroids = [[4, 5, 0, 0], [0, 1, 0, 0]]
-    # import pdb; pdb.set_trace()
     clusters._classify(data)
     assert True

From 121fb90685cf94686968fd82d152585b2c9e42a3 Mon Sep 17 00:00:00 2001
From: pasaunders <kaorti@gmail.com>
Date: Tue, 21 Feb 2017 23:06:25 -0800
Subject: [PATCH 06/10] added a passing test of fit

---
 README.MD          |  5 +++++
 src/kmeans.py      | 31 ++++++++++++++++++-------------
 src/test_kmeans.py | 29 +++++++++++++++++++++++------
 3 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/README.MD b/README.MD
index 1ee38ee..e12ed7d 100644
--- a/README.MD
+++ b/README.MD
@@ -55,3 +55,8 @@ in Python containing the following methods:
     size: O(1)
     remove: O(n**2)
     traversal: O(k) + O(2n)
+
+
+Kmeans:
+    collaboration note - this program was a collaboration between Ted,
+    Patrick, Rick, Julien, and Avery
\ No newline at end of file
diff --git a/src/kmeans.py b/src/kmeans.py
index 9fada1a..03d1029 100644
--- a/src/kmeans.py
+++ b/src/kmeans.py
@@ -18,16 +18,17 @@ def fit(self, data, k=2):
         if k < 0 or k > len(data):
             raise ValueError("K must be a positive integer less than the length of data.")
 
-        data['group'] = None
+        data['group'] = 0
+
         self.centroids = self._random_centroids(data, k)
         iteration = 0
-        old_centroids = None
+        old_centroids = self.centroids
 
         while not self._should_stop(old_centroids, iteration, k):
             old_centroids = self.centroids
             iteration += 1
             self._classify(data)
-            self.centroids = self._assign_centroids(data, k)
+            self._assign_centroids(data, k)
         self.fitted = True
 
     def predict(self, data):
@@ -47,14 +48,14 @@ def _calc_distance(self, pt1, pt2):
         return sqrt(dist)
 
     def _classify(self, data):
-        """Assign each datapoint t o the nearest centroid."""
+        """Assign each datapoint to the nearest centroid."""
         for point in data.iterrows():
             print('point: ', point)
             distances = []
             for cent in self.centroids:
                 print('cent', cent)
                 distances.append(self._calc_distance(cent, point[1]))
-            point[1]["group"] = distances.index(min(distances))
+            data.set_value(point[0], "group", distances.index(min(distances)))
 
     def _find_mean(self, points):
         """Find the mean coordinates of points."""
@@ -65,19 +66,20 @@ def _find_mean(self, points):
 
     def _assign_centroids(self, data, k):
         """Set centroid coordinates to mean of their assigned datapoints."""
-        groups = []
-        for _ in range(k):
-            groups.append(data[data["groups"] == k])
-        for idx, group in enumerate(groups):
-            self.centroids[idx] = self._find_mean(group)
+        groups = data.group.unique().tolist()
+        # for idx, group in enumerate(groups):
+        for value in groups:
+            self.centroids[value] = self._find_mean(data.loc[data['group'] == value])
 
-    def _should_stop(self, old_centroids, centroids, iteration, k):
+    def _should_stop(self, old_centroids, iteration, k):
         """Determine if the fit should stop runnng."""
+        if iteration < 1:
+            return False
         if iteration > self.max_iter:
             return True
         centroid_movements = []
         for i in range(k):
-            centroid_movements.append(self._calc_distance(old_centroids[i], centroids[i]))
+            centroid_movements.append(self._calc_distance(old_centroids[i], self.centroids[i]))
         if self.min_step:
             if max(centroid_movements) < self.min_step:
                 return True
@@ -87,5 +89,8 @@ def _random_centroids(self, data, k):
         """Return randomly generated centroids."""
         k_list = []
         for i in range(k):
-            k_list.append([random.uniform(min(data[column]), max(data[column])) for column in range(len(data) - 2)])
+            centroid = []
+            for column in data.columns.values:
+                centroid.append(random.uniform(min(data[column]), max(data[column])))
+            k_list.append(centroid)
         return k_list
diff --git a/src/test_kmeans.py b/src/test_kmeans.py
index ec7e472..7489f59 100644
--- a/src/test_kmeans.py
+++ b/src/test_kmeans.py
@@ -40,16 +40,20 @@ def test_find_mean(kmc, some_data):
 
 def test_should_stop_false_max_iter(kmc):
     """Test that a kmc does not doesnt stop iterating, with only a max iter of 5 set."""
+    from kmeans import KMeansClassifier
+    clusters = KMeansClassifier()
     old_centroids = [[2, 3], [7, 8]]
-    new_centroids = [[2, 4], [90, 55]]
-    assert not kmc._should_stop(old_centroids, new_centroids, 3, 2)
+    clusters.centroids = [[2, 4], [90, 55]]
+    assert not clusters._should_stop(old_centroids, 3, 2)
 
 
 def test_should_stop_true_max_iter(kmc):
     """Test that a kmc does not doesnt stop iterating, with only a max iter of 5 set."""
-    old_centroids = [[2, 3], [7, 8]]
-    new_centroids = [[2, 4], [90, 55]]
-    assert kmc._should_stop(old_centroids, new_centroids, 6, 2)
+    from kmeans import KMeansClassifier
+    clusters = KMeansClassifier()
+    old_centroids = [[2, 3, 0, 0], [7, 8, 0, 0]]
+    clusters.centroids = [[2, 4], [90, 55]]
+    assert clusters._should_stop(old_centroids, 6, 2)
 
 
 def test_classify():
@@ -61,4 +65,17 @@ def test_classify():
     clusters.centroids = None
     clusters.centroids = [[4, 5, 0, 0], [0, 1, 0, 0]]
     clusters._classify(data)
-    assert True
+    assert data.iloc[0, 3] == 0 and data.iloc[1, 3] == 1
+
+
+def test_fit():
+    """Unit integration test for fitting centroids."""
+    from kmeans import KMeansClassifier
+    clusters = KMeansClassifier()
+    rows = [[20, 20, 1, 0], [0, 0, 1, 0]]
+    data = pd.DataFrame(data=rows, columns=['x', 'y', 'class', 'group'])
+    clusters.fit(data)
+    assert clusters.fitted
+
+
+# def test_random_centroids()

From aa748e9ff5617932d4b8e9d595f79d4adf2ecfd5 Mon Sep 17 00:00:00 2001
From: Ted Callahan <CCallahanIV@gmail.com>
Date: Wed, 22 Feb 2017 13:24:16 -0800
Subject: [PATCH 07/10] Removed print statement

---
 src/kmeans.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/kmeans.py b/src/kmeans.py
index 9fada1a..3b8be1f 100644
--- a/src/kmeans.py
+++ b/src/kmeans.py
@@ -42,7 +42,6 @@ def _calc_distance(self, pt1, pt2):
         dist = 0.0
         print('distance called from: ', pt1, pt2)
         for i in range(len(pt1) - 2):
-            print('i:', i, 'pt1:', pt1[i], 'pt2:', pt2[i])
             dist += (pt1[i] - pt2[i])**2
         return sqrt(dist)
 

From d5a72c0763aee2b96634fe837b2365ed39d24b3c Mon Sep 17 00:00:00 2001
From: Ted Callahan <CCallahanIV@gmail.com>
Date: Wed, 22 Feb 2017 16:10:43 -0800
Subject: [PATCH 08/10] Troubleshot various components of calculating and
 assigning centroids and rewrote predict.

---
 src/kmeans.py | 39 +++++++++++++++++++++------------------
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/src/kmeans.py b/src/kmeans.py
index 3b8be1f..5d772f2 100644
--- a/src/kmeans.py
+++ b/src/kmeans.py
@@ -24,36 +24,37 @@ def fit(self, data, k=2):
         old_centroids = None
 
         while not self._should_stop(old_centroids, iteration, k):
+            print("Iteration: ", iteration)
             old_centroids = self.centroids
             iteration += 1
-            self._classify(data)
-            self.centroids = self._assign_centroids(data, k)
+            data = self._classify(data)
+            self._assign_centroids(data, k)
         self.fitted = True
 
     def predict(self, data):
         """Predict the class of given test data after fit."""
         if self.fitted is False:
             raise RuntimeError('Run KMeansClassifier.fit before running predict.')
-        self._classify(data)
-        return data
+        distances = []
+        for centroid in self.centroids:
+            distances.append((centroid[-1], self._calc_distance(data, centroid[:-1])))
+        return min(distances, key=lambda x: x[1])[0]
 
     def _calc_distance(self, pt1, pt2):
         """Calculate the distance between two points."""
         dist = 0.0
-        print('distance called from: ', pt1, pt2)
         for i in range(len(pt1) - 2):
             dist += (pt1[i] - pt2[i])**2
         return sqrt(dist)
 
     def _classify(self, data):
-        """Assign each datapoint t o the nearest centroid."""
-        for point in data.iterrows():
-            print('point: ', point)
+        """Assign each datapoint to the nearest centroid."""
+        for i in range(len(data)):
             distances = []
             for cent in self.centroids:
-                print('cent', cent)
-                distances.append(self._calc_distance(cent, point[1]))
-            point[1]["group"] = distances.index(min(distances))
+                distances.append(self._calc_distance(cent, data.iloc[i]))
+            data.set_value(i, 'group', distances.index(min(distances)))
+        return data
 
     def _find_mean(self, points):
         """Find the mean coordinates of points."""
@@ -65,18 +66,20 @@ def _find_mean(self, points):
     def _assign_centroids(self, data, k):
         """Set centroid coordinates to mean of their assigned datapoints."""
         groups = []
-        for _ in range(k):
-            groups.append(data[data["groups"] == k])
+        for i in range(k):
+            group = data[data["group"] == i]
+            groups.append(group)
         for idx, group in enumerate(groups):
             self.centroids[idx] = self._find_mean(group)
 
-    def _should_stop(self, old_centroids, centroids, iteration, k):
+    def _should_stop(self, old_centroids, iteration, k):
         """Determine if the fit should stop runnng."""
         if iteration > self.max_iter:
             return True
-        centroid_movements = []
-        for i in range(k):
-            centroid_movements.append(self._calc_distance(old_centroids[i], centroids[i]))
+        if old_centroids:
+            centroid_movements = []
+            for i in range(k):
+                centroid_movements.append(self._calc_distance(old_centroids[i], self.centroids[i]))
         if self.min_step:
             if max(centroid_movements) < self.min_step:
                 return True
@@ -86,5 +89,5 @@ def _random_centroids(self, data, k):
         """Return randomly generated centroids."""
         k_list = []
         for i in range(k):
-            k_list.append([random.uniform(min(data[column]), max(data[column])) for column in range(len(data) - 2)])
+            k_list.append([random.uniform(min(data[column]), max(data[column])) for column in data.columns.values[:-2]])
         return k_list

From 31654367d5908f5b63820628057d6d451abd6b97 Mon Sep 17 00:00:00 2001
From: Ted Callahan <CCallahanIV@gmail.com>
Date: Wed, 22 Feb 2017 16:11:12 -0800
Subject: [PATCH 09/10] Removed print statement.

---
 src/kmeans.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/kmeans.py b/src/kmeans.py
index 5d772f2..40f5ee0 100644
--- a/src/kmeans.py
+++ b/src/kmeans.py
@@ -24,7 +24,6 @@ def fit(self, data, k=2):
         old_centroids = None
 
         while not self._should_stop(old_centroids, iteration, k):
-            print("Iteration: ", iteration)
             old_centroids = self.centroids
             iteration += 1
             data = self._classify(data)

From f0dbf7f664ecfa2c58a89c14225444256766794c Mon Sep 17 00:00:00 2001
From: pasaunders <kaorti@gmail.com>
Date: Thu, 23 Feb 2017 11:47:38 -0800
Subject: [PATCH 10/10] updated readme

---
 README.MD | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/README.MD b/README.MD
index e12ed7d..d5c3423 100644
--- a/README.MD
+++ b/README.MD
@@ -57,6 +57,11 @@ in Python containing the following methods:
     traversal: O(k) + O(2n)
 
 
-Kmeans:
-    collaboration note - this program was a collaboration between Ted,
-    Patrick, Rick, Julien, and Avery
\ No newline at end of file
+## K-Means Classifier
+    Establish *k* nodes, each one representing a centroid (the center of a cluster of data). The algorithm then positions each centroid so that the points nearest to it are, on average, closer to it than to any other centroid.
+
+    Public methods:
+        clf.fit(self, data, k): Generates k centroids with which to classify the given data.
+        clf.predict(self, data): Returns the predicted class for the given data. clf.fit must be called first; otherwise a RuntimeError is raised.
+
+    Work done in collaboration with [Julien Wilson](https://github.com/julienawilson), [Ted Callahan](https://github.com/CCallahanIV), [Patrick Saunders](https://github.com/pasaunders) and [Avery Pratt](https://github.com/AveryPratt).
\ No newline at end of file