In [37]:
import random
from array import *
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import distance

from ipywidgets import IntSlider
from ipywidgets.embed import embed_minimal_html

# Demo: export an ipywidgets slider to a standalone HTML file.
# NOTE(review): this widget is unrelated to the K-Means analysis below;
# consider removing it so the notebook stays focused and reproducible.
slider = IntSlider(value=40)
embed_minimal_html('export.html', views=[slider], title='Widgets export')

Part I

-First we will create a random data set. Then we will feed the data through the initial process of K-Means 
to verify everything functions correctly.

- The random data set

In [2]:
def initializeDataSet(numPoints=20, low=1, high=100):
    """Append `numPoints` random (x, y) integer coordinates to the global
    dataSet list, each component drawn uniformly from [low, high].

    The defaults reproduce the original behavior (20 points in [1, 100]);
    callers may now generate data sets of any size and range.
    """
    for _ in range(numPoints):
        randomCoordinate = random.randint(low, high), random.randint(low, high)
        dataSet.append(randomCoordinate)

def convertDataSet():
    """Split the global dataSet of (x, y) tuples into the parallel global
    xValues / yValues lists used for plotting."""
    for x, y in dataSet:
        xValues.append(x)
        yValues.append(y)
In [3]:
# Global state shared by the helper functions above/below.
dataSet = []   # list of (x, y) tuples
xValues = []   # x components of dataSet (for plotting)
yValues = []   # y components of dataSet (for plotting)

initializeDataSet();
convertDataSet();

# Show the raw data as black dots.
plt.plot(xValues,yValues, 'ko')

Out[3]:
[<matplotlib.lines.Line2D at 0x61880aba8>]
In [4]:
def findMaxOfX(dataSet):
    """Return the largest x value in a list of (x, y) tuples.

    Uses an explicit key for clarity and consistency with findMaxOfY;
    the original relied on lexicographic tuple comparison, which only
    happens to compare the x component first.
    """
    maxValue = max(dataSet, key=lambda point: point[0])
    return maxValue[0]
    
def findMaxOfY(dataSet):
    """Return the largest y value in a list of (x, y) tuples."""
    return max(point[1] for point in dataSet)

-Feed through K-Means

Step 1: Create k numbers of random center points

In [5]:
def plotCenterPoints():    # Used after createRandomCentroids()
    """Plot each centroid in the global `centroids` list as a coloured
    diamond.  Only four marker styles are defined, so at most four
    centroids can be plotted by this helper."""
    centerPointColor = ['bD', 'rD', 'gD', 'yD']
    for i, (cx, cy) in enumerate(centroids):
        plt.plot(cx, cy, centerPointColor[i])
        
def createRandomCentroids(dataSet, k):
    """Replace the global `centroids` list with k random points drawn
    inside the bounding box of dataSet."""
    centroids.clear()
    # The bounds do not change inside the loop, so compute them once.
    xMax = findMaxOfX(dataSet)
    yMax = findMaxOfY(dataSet)
    for _ in range(k):
        centroids.append((random.randint(1, xMax), random.randint(1, yMax)))
In [6]:
# Initialise two random centroids and show them against the data.
centroids = []
createRandomCentroids(dataSet, 2)
plotCenterPoints()
plt.plot(xValues,yValues,'ko')
Out[6]:
[<matplotlib.lines.Line2D at 0x61bb3d7b8>]

Step 2: Calculate Euclidean distance between data set and centroids

Pseudocode:

For each data in the dataSet:
    For each centroid:
        get the distance between the data and each centroid and append to listOfDistances
    get the minimum distance from the list and append to nearestCentroid
    clear the listOfDistances after each iteration
In [7]:
def getDistance(firstCoord, secondCoord):    # Used by createListOfShortestDistances()
    """Euclidean distance between two (x, y) coordinates."""
    dist = distance.euclidean(firstCoord, secondCoord)
    return dist

def getMinimumDistance():    # Used by createListOfShortestDistances()
    """Pick the (centroidIndex, distance) pair with the smallest distance
    from the global listOfDistances and record its centroid index in the
    global nearestCentroid list."""
    closestIndex, _ = min(listOfDistances, key=lambda pair: pair[1])
    nearestCentroid.append(closestIndex)
    
def createListOfShortestDistances():
    """For every point in the global dataSet, compute its distance to each
    centroid and record the index of the nearest centroid in the global
    nearestCentroid list (via getMinimumDistance).

    Uses the global listOfDistances as scratch space; it is left empty on
    return.  (The original's dead `distanceData = []` initialisation,
    immediately overwritten by a tuple, has been removed.)
    """
    nearestCentroid.clear()
    for i in range(len(dataSet)):
        for j in range(len(centroids)):
            # (centroid index, distance from this point to that centroid)
            listOfDistances.append((j, getDistance(dataSet[i], centroids[j])))
        getMinimumDistance()
        listOfDistances.clear()
In [8]:
# Scratch list of (centroidIndex, distance) pairs, and the resulting
# nearest-centroid index for each data point.
listOfDistances = []
nearestCentroid = []

createListOfShortestDistances()
print(nearestCentroid)
[1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0]

Step 3: Cluster the data based on the nearest centroid

In [9]:
def createClusterIndex():    #needs to be included in createClusters()
    """Reset the global cluster lists to one sub-list per centroid.

    Each sub-list is seeded with its own cluster index as element 0.  The
    rest of the code (plotClusters, getAverage) treats that first element
    as a label, not data, and skips it when reading values.
    """
    cluster_xValues.clear()
    cluster_yValues.clear()
    for i in range(len(centroids)):
        cluster_xValues.append([i])
        cluster_yValues.append([i])
    
def createClusters():
    """Group the global xValues/yValues into cluster_xValues /
    cluster_yValues according to each point's nearest centroid."""
    createClusterIndex()
    for i, clusterIndex in enumerate(nearestCentroid):
        cluster_xValues[clusterIndex].append(xValues[i])
        cluster_yValues[clusterIndex].append(yValues[i])
In [10]:
# Circle markers, one colour per cluster (supports up to 7 clusters).
colors = ['bo', 'ro', 'go', 'yo', 'co', 'mo', 'ko']

# Per-cluster coordinate lists; element 0 of each sub-list is the cluster index.
cluster_xValues = []
cluster_yValues = []

createClusters()
In [11]:
def plotClusters():
    """Plot every clustered point, one colour per cluster.  Element 0 of
    each cluster list is the cluster-index label, so it is skipped."""
    for i in range(len(cluster_xValues)):
        xs = cluster_xValues[i][1:]
        ys = cluster_yValues[i][1:]
        for x, y in zip(xs, ys):
            plt.plot(x, y, colors[i])
 
In [12]:
plotClusters()
plotCenterPoints()

Step 4: Move the centroids to the average of the cluster

Pseudocode:

    Clear the centroids (list of x,y coords for each centroid)
    Get the average of the x values and the average of y values of each cluster.
    Append each average to centroids
In [13]:
def getAverage(list, index):
    """Mean of the values in cluster `index`, skipping the leading label.

    Each entry of `list` has the form [clusterIndex, v1, v2, ...]; the
    first element is a label, not data, so it is excluded.  Returns 0 for
    an empty cluster.  (Parameter is kept as `list` for caller
    compatibility even though it shadows the builtin.)
    """
    values = list[index][1:]   # drop the cluster-index marker at position 0
    if values:
        return sum(values) / len(values)
    return 0

def shiftCentroids():
    """Move every centroid to the mean of its current cluster.

    Returns True when no centroid moved (convergence), False otherwise.
    On the very first call (global xAverage still empty) the averages are
    recorded and False is returned unconditionally.

    Bug fix: the total shift is now a sum of *absolute* deltas.  The
    original summed signed deltas, so moves in opposite directions could
    cancel to zero and falsely report convergence.  The local was also
    renamed from `distance`, which shadowed the imported scipy module.
    """
    totalShift = 0

    if(len(xAverage) == 0):
        # First call: record the cluster means and move the centroids.
        for i in range(len(centroids)):

            nu_xAverage = getAverage(cluster_xValues, i)
            nu_yAverage = getAverage(cluster_yValues, i)

            xAverage.append(nu_xAverage)
            yAverage.append(nu_yAverage)

            centroids[i] = (nu_xAverage, nu_yAverage)

        return False
    else:
        for i in range(len(centroids)):
            nu_xAverage = getAverage(cluster_xValues, i)
            nu_yAverage = getAverage(cluster_yValues, i)

            # abs() so that opposite-direction moves cannot cancel out.
            totalShift = totalShift + abs(xAverage[i] - nu_xAverage)
            totalShift = totalShift + abs(yAverage[i] - nu_yAverage)

            xAverage[i] = nu_xAverage
            yAverage[i] = nu_yAverage

            centroids[i] = (nu_xAverage, nu_yAverage)

    if totalShift == 0:
        return True
    else:
        return False
    
def plotNewCentroids():
    """Plot each centroid's recorded (xAverage, yAverage) position as a
    thin coloured diamond (one entry of centerPointColors per centroid)."""
    for i in range(len(centroids)):
        plt.plot(xAverage[i],yAverage[i], centerPointColors[i])
In [14]:
# Running per-centroid averages, plus thin-diamond markers for the shifted
# centroids (7 entries, matching `colors`).
xAverage = []
yAverage = []
centerPointColors = ['bd', 'rd', 'gd', 'yd', 'cd', 'md', 'kd']
shiftCentroids()
plotNewCentroids()
plotClusters()

Step 5: Check to see if the shift in centroid will change the data's cluster classification

In [15]:
def checkToSeeIfDataChangesClusters():
    """Re-assign every data point to its nearest (possibly shifted)
    centroid and rebuild the clusters.

    NOTE: the explicit nearestCentroid.clear() was removed as redundant —
    createListOfShortestDistances() clears that list itself.
    """
    createListOfShortestDistances()
    createClusters()
    
# Re-cluster against the shifted centroids and visualise the result.
checkToSeeIfDataChangesClusters()
plotClusters()
plotNewCentroids()

Step 6: Check to see if the center points shift to a new average

In [16]:
# One more shift to check whether the centroids have stabilised.
shiftCentroids()
plotClusters()
plotNewCentroids()
print(centroids)
[(37.5, 56.166666666666664), (76.625, 29.0)]

Part II

  • Now that all of the functionality has been verified, we can create the actual K-Means algorithm.
  • With K-Means created, we can test the rate of convergence of the algorithm.
  • NOTE: The number of clusters that can be plotted is limited by the length of the colour lists used with plt.plot (seven marker styles are defined).
In [17]:
def kMeans(dataSet, k):
    """Run K-Means on dataSet with k clusters until the centroids stop
    moving, capped at 100 cycles.  Returns the number of training cycles
    used (the initial assignment counts as cycle 1)."""
    xAverage.clear()
    yAverage.clear()
    converged = False
    trainingCycles = 1
    createRandomCentroids(dataSet, k)
    createListOfShortestDistances()
    createClusters()
    while not converged and trainingCycles < 100:
        converged = shiftCentroids()
        checkToSeeIfDataChangesClusters()
        trainingCycles += 1
    return trainingCycles

def getTrainingCycles(nuDataSet, k):
    """Run kMeans once, report how many cycles it took to converge, and
    plot the final clusters and centroids."""
    cycles = kMeans(nuDataSet, k)
    print("Total cycles until convergence: " + str(cycles))
    plotClusters()
    plotNewCentroids()
In [18]:
getTrainingCycles(dataSet, 2)    # k = 2
Total cycles until convergence: 5
In [19]:
getTrainingCycles(dataSet, 3)    # k = 3
Total cycles until convergence: 5
In [20]:
getTrainingCycles(dataSet, 4)    # k = 4
Total cycles until convergence: 4
In [21]:
getTrainingCycles(dataSet, 5)    # k = 5
Total cycles until convergence: 6
In [22]:
getTrainingCycles(dataSet, 6)    # k = 6
Total cycles until convergence: 4
In [23]:
getTrainingCycles(dataSet, 7)    # k = 7 (limit of the colour lists)
Total cycles until convergence: 3
In [24]:
# Build a deliberately clustered data set: two tight groups in opposite
# corners plus a single midpoint outlier at (50, 50).
dataSet = []
xValues = []
yValues = []

for i in range(10):
    randomCoordinate = random.randint(1,25), random.randint(1,25)
    dataSet.append(randomCoordinate)
for i in range(9):
    randomCoordinate = random.randint(75,100), random.randint(75,100)
    dataSet.append(randomCoordinate)
dataSet.append((50,50))
convertDataSet()
print(dataSet)
[(3, 21), (2, 19), (14, 10), (23, 6), (8, 12), (10, 14), (3, 11), (5, 15), (24, 8), (23, 10), (86, 84), (82, 93), (76, 84), (88, 94), (92, 90), (97, 75), (98, 88), (93, 75), (94, 84), (50, 50)]
In [25]:
getTrainingCycles(dataSet, 2)    # k = 2 on the clustered data
Total cycles until convergence: 4
In [26]:
getTrainingCycles(dataSet, 4)    # k = 4 on the clustered data
Total cycles until convergence: 3
In [27]:
# Final centroid positions after the k=4 run.
print(centroids)
[(0, 0), (89.55555555555556, 85.22222222222223), (11.5, 12.6), (50.0, 50.0)]
In [28]:
# Show the recorded (xAverage, yAverage) for each centroid.
for i in range(len(centroids)):
        print(xAverage[i],yAverage[i])
0 0
89.55555555555556 85.22222222222223
11.5 12.6
50.0 50.0

Part III

  • Now that we know the K-Means algorithm works, we can implement it into a Monte-Carlo method to examine the average convergence rate for dataSets.
  • With the Monte-Carlo method created, we can then create a createClusteredDataSet method to create clustered data sets of varying size and spread.
  • With both methods created, we can examine how size and spread affects the average convergence rate.
In [29]:
def monteCarlo(nuDataSet, k, n=1000):
    """Estimate the average number of K-Means training cycles until
    convergence by running kMeans repeatedly and averaging.

    Parameters:
        nuDataSet: list of (x, y) tuples to cluster.
        k: number of clusters.
        n: number of independent runs to average over (default 1000,
           matching the original hard-coded value).
    """
    counter = 0
    for _ in range(n):
        counter = counter + kMeans(nuDataSet, k)
    Average = counter / n
    print("Average Convergence Rate: " + str(Average) + " training cycles")
In [30]:
# Average convergence rate over 1000 runs on the clustered data, k = 4.
monteCarlo(dataSet, 4)
Average Convergence Rate: 4.142 training cycles
In [31]:
def createClusteredDataSet(clusterSize, spread, offset=75):
    """Rebuild the global dataSet as two square clusters of `clusterSize`
    random points each: one anchored at (1, 1) and one at
    (offset, offset), each `spread` units wide.  Refreshes the global
    xValues/yValues as well.

    The default offset of 75 matches the original hard-coded value, so
    existing calls behave identically.
    """
    dataSet.clear()
    xValues.clear()
    yValues.clear()

    for i in range(clusterSize):
        randomCoordinate = random.randint(1, spread), random.randint(1, spread)
        dataSet.append(randomCoordinate)
    for i in range(clusterSize):
        randomCoordinate = random.randint(offset, offset + spread), random.randint(offset, offset + spread)
        dataSet.append(randomCoordinate)
    convertDataSet()

    

Now we can examine how spread affects convergence

In [32]:
# Spread = 15: tight, well-separated clusters.
createClusteredDataSet(10,15)
plt.plot(xValues,yValues, 'ko')
Out[32]:
[<matplotlib.lines.Line2D at 0x61c28b6a0>]
In [33]:
# Average convergence for the spread-15 data set.
monteCarlo(dataSet, 2)
plotClusters()
plotNewCentroids()
Average Convergence Rate: 3.214 training cycles
In [34]:
# Spread = 30: wider clusters.
createClusteredDataSet(10,30)
plt.plot(xValues,yValues, 'ko')
Out[34]:
[<matplotlib.lines.Line2D at 0x61c3d04e0>]
In [35]:
# Average convergence for the spread-30 data set.
monteCarlo(dataSet, 2)
plotClusters()
plotNewCentroids()
Average Convergence Rate: 3.453 training cycles
In [36]:
# Spread = 60: clusters begin to overlap.
createClusteredDataSet(10,60)
plt.plot(xValues,yValues, 'ko')
Out[36]:
[<matplotlib.lines.Line2D at 0x61c5d1da0>]
In [ ]:
# Average convergence for the spread-60 data set.
monteCarlo(dataSet, 2)
plotClusters()
plotNewCentroids()
In [ ]:
# Spread = 120: clusters overlap heavily.
createClusteredDataSet(10,120)
plt.plot(xValues,yValues, 'ko')
In [ ]:
# Average convergence for the spread-120 data set.
monteCarlo(dataSet, 2)
plotClusters()
plotNewCentroids()

Now we can examine how size affects convergence

In [ ]:
# Cluster size = 20 points per cluster (spread fixed at 60).
createClusteredDataSet(20,60)
plt.plot(xValues,yValues, 'ko')
In [ ]:
# Average convergence for 20-point clusters.
monteCarlo(dataSet, 2)
plotClusters()
plotNewCentroids()
In [ ]:
# Cluster size = 40 points per cluster (spread fixed at 60).
createClusteredDataSet(40,60)
plt.plot(xValues,yValues, 'ko')
In [ ]:
# Average convergence for 40-point clusters.
monteCarlo(dataSet, 2)
plotClusters()
plotNewCentroids()
In [ ]:
# Cluster size = 80 points per cluster (spread fixed at 60).
createClusteredDataSet(80,60)
plt.plot(xValues,yValues, 'ko')
In [ ]:
# Average convergence for 80-point clusters.
monteCarlo(dataSet, 2)
plotClusters()
plotNewCentroids()