diff --git a/personal_transaction_clustering.ipynb b/personal_transaction_clustering.ipynb new file mode 100644 index 0000000..e23a51f --- /dev/null +++ b/personal_transaction_clustering.ipynb @@ -0,0 +1,404 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load Packages" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from sklearn.cluster import KMeans\n", + "import pymysql.cursors\n", + "import pandas as pd\n", + "import numpy as np\n", + "import csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "pwd = \"password\"\n", + "user_name = \"root\"\n", + "host_loc = \"localhost\"\n", + "data_base = \"database_name\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "dbCON = pymysql.connect(user=user_name, password=pwd, host=host_loc, database=data_base)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "selector = dbCON.cursor()\n", + "selector.execute(\"SELECT * FROM transaction\")\n", + "rows = selector.fetchall()\n", + "df = pd.DataFrame(data = np.array(rows))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "dbCON.close() " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Pre-Processing Data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.get_dummies(df, columns = [3])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "del df[6], df[1], df[2], df[4]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py:1: FutureWarning: convert_objects is deprecated. To re-infer data dtypes for object columns, use DataFrame.infer_objects()\n", + "For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.\n", + " \"\"\"Entry point for launching an IPython kernel.\n" + ] + } + ], + "source": [ + "df = df.convert_objects(convert_numeric = True).fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
053_3_ATRID3_DIGRID3_DIGTRID3_FTRID3_FTRID3_HTRID3_PHTRID3_PRTRID3_SVTRID3_TTRID
000.0010000000000
1302-8.3400001000000
2302-4.2000001000000
3501-10.0000000000001
4302-25.0000001000000
\n", + "
" + ], + "text/plain": [ + " 0 5 3_ 3_ATRID 3_DIGRID 3_DIGTRID 3_FTRID 3_FTRID 3_HTRID \\\n", + "0 0 0.00 1 0 0 0 0 0 0 \n", + "1 302 -8.34 0 0 0 0 1 0 0 \n", + "2 302 -4.20 0 0 0 0 1 0 0 \n", + "3 501 -10.00 0 0 0 0 0 0 0 \n", + "4 302 -25.00 0 0 0 0 1 0 0 \n", + "\n", + " 3_PHTRID 3_PRTRID 3_SVTRID 3_TTRID \n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 1 \n", + "4 0 0 0 0 " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "X = df.values" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0. , 0. , 1. , ..., 0. , 0. , 0. ],\n", + " [302. , -8.34, 0. , ..., 0. , 0. , 0. ],\n", + " [302. , -4.2 , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [302. , -0.79, 0. , ..., 0. , 0. , 0. ],\n", + " [302. , -4. , 0. , ..., 0. , 0. , 0. ],\n", + " [302. , -11.62, 0. , ..., 0. , 0. , 0. ]])" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Calculating the \"Best\" Value for `k`" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "kmeans_fit = [KMeans(n_clusters = iter_val).fit(X) for iter_val in range(5,50,5)]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "scores = [(iter_val, model.score(X, model.predict(X))) for iter_val, model in enumerate(kmeans_fit)]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "scoresDF = pd.DataFrame(scores)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYAAAAEDCAYAAAA849PJAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAE1xJREFUeJzt3X+snuV93/H3J7ZJTpKlJgGBbUKNVuaVhS7uzkgISzoFM5OsCg5aJlDbkS2RWzWpknX1BEPatE0TbN7aalpXyYN0zpZBKDE/trA6/IhEKjUZh5jIgOtAaH74B9hJ6qasngLOd3+c28nh5Jzjc/w859yPz/V+SUfn/nE9z/WVMdfnua/7uh+nqpAktedVfRcgSeqHASBJjTIAJKlRBoAkNcoAkKRGGQCS1KiRD4Akn0hyJMmT82j7W0me6H6+muTYUtQoSWeijPpzAEneBbwIfLKq3rKA1/0asLGq/tGiFSdJZ7CRvwKoqkeB7049luQvJ/mDJI8n+UKSvzrDS68H7liSIiXpDLSy7wJO0w7gV6rqmSRvA/4z8O6TJ5P8JHAR8EhP9UnSyDvjAiDJ64F3AL+f5OThV09rdh1wd1WdWMraJOlMcsYFAJPTVseq6q1ztLkO+MgS1SNJZ6SRvwcwXVV9D/iTJB8AyKS/fvJ8dz/gbOCPeipRks4IIx8ASe5gcjDfkORAkg8BvwB8KMlXgKeAa6a85Drgzhr15U2S1LORXwYqSVocI38FIElaHCN9E/icc86p9evX912GJJ0xHn/88W9X1bnzaTvSAbB+/XomJib6LkOSzhhJvjHftk4BSVKjDABJatRQAiDJ1Un2J3k2yY0znH91kk9357+UZP0w+pUknb6BAyDJCuB3gPcAlwDXJ7lkWrMPAX9aVT8F/BbwbwftV5I0mGFcAVwGPFtVz1XV94E7eeWDWXT7O7vtu4ErM+WLfCRJS28Yq4DWAd+asn8AeNtsbarq5SR/BrwJ+PYQ+pekZeHePQfZvns/h44dZ+3qMbZt3sCWjesWrb+RWwaaZCuwFeDCCy/suRpJy9FSD7TzremmXXs5/tLklxgfPHacm3btBVi02oYxBXQQePOU/Qu6YzO2SbIS+AngOzO9WVXtqKrxqho/99x5PcsgaUTdu+cgV9z6CBfd+FmuuPUR7t0zfWjop6abdu3l4LHjFD8aaPuubfvu/T8c/E86/tIJtu/ev2h9DiMAHgMuTnJRkrOY/DK2+6e1uR+4odv+e8AjflmbtLw50C7MoWPHF3R8GAYOgKp6GfgosBvYB9xVVU8l+VdJ3tc1ux14U5JngV8HfmypqKTTN4qftB1oF2bt6rEFHR+GodwDqKoHgAemHfvnU7b/H/CBYfQl6ZX6mDuej1EeaA/OUMNiDrTzsW3zhlf8dwQYW7WCbZs3LFqfPgksneFG9ZN2H59o52Pb5g2MrVrximOLPdDOx5aN67jl2ktZt3qMAOtWj3HLtZe2tQpI0sKM6iftPj7RzsfJAXXUVgHBZG1LWYcBIC3AKC4fHNUpDQfa0WcASPM0qnPto/pJGxxoR533AKR5GtW59j7mjrU8eAUgzdOozrWDn7R1erwCkOZpVFe1SKfLAJDmaVSXD0qnyykgaZ5GeVWLdDoMAI2kUVxuCc61a3kxADRyRnW5pbTceA9AI2dUl1tKy40BoJEzysstpeXEANDIcbmltDQMAI0cl1tKS8ObwBo5LreUloYBoJHkcktp8TkFJEmNMgAkqVEGgCQ1ygCQpEYZAJLUqIFWASV5I/BpYD3wdeDvV9WfTmvzVuB3gTcAJ4B/U1WfHqRfDc+ofumapMU36BXAjcDDVXUx8HC3P91fAP+gqv4acDXw20lWD9ivhuDkl64dPHac4kdfunbvnoN9lyZpCQwaANcAO7vtncCW6Q2q6qtV9Uy3fQg4Apw7YL8aAr90TWrboAFwXlUd7rafB86bq3GSy4CzgK/N0WZrkokkE0ePHh2wPM3FL12T2nbKewBJHgLOn+HUzVN3qqqS1Bzvswb4b8ANVfWD2dpV1Q5gB8D4+Pis76fBrV09xsEZBnu/dE1qwykDoKo2zXYuyQtJ1lTV4W6APzJLuzcAnwVurqovnna1Gqptmze84h9eAb90TWrJoFNA9wM3dNs3APdNb5DkLOAe4JNVdfeA/WmItmxcxy3XXsq61WMEWLd6jFuuvdRVQFIjUnX6syxJ3gTcBVwIfIPJZaDfTTIO/EpVfTjJLwK/Bzw15aUfrKonTvX+4+PjNTExcdr1SVJrkjxeVePzajtIACw2A0CSFmYhAeCTwJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjVvZdQCvu3XOQ7bv3c+jYcdauHmPb5g1s2biu77IkNcwAWAL37jnITbv2cvylEwAcPHacm3btBTAEJPXGKaAlsH33/h8O/icdf+kE23fv76kiSRpCACR5Y5IHkzzT/T57jrZvSHIgyX8atN8zyaFjxxd0XJKWwjCuAG4EHq6qi4GHu/3Z/Gvg0SH0eUZZu3psQcclaSkMIwCuAXZ22zuBLTM1SvI3gPOAzw2hzzPKts0bGFu14hXHxlatYNvmDT1VJEnDCYDzqupwt/08k4P8KyR5FfAfgN841Zsl2ZpkIsnE0aNHh1Be/7ZsXMct117KutVjBFi3eoxbrr3UG8CSejWvVUBJHgLOn+HUzVN3qqqS1AztfhV4oKoOJJmzr6raAewAGB8fn+m9zkhbNq5zwJc0UuYVAFW1abZzSV5IsqaqDidZAxyZodnlwDuT/CrweuCsJC9W1Vz3CyRJi2gYzwHcD9wA3Nr9vm96g6r6hZPbST4IjDv4S1K/hnEP4FbgqiTPAJu6fZKMJ7ltCO8vSVoEqRrdafbx8fGamJjouwxJOmMkebyqxufT1ieBJalRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowYKgCRvTPJgkme632fP0u7CJJ9Lsi/J00nWD9KvJGlwg14B3Ag8XFUXAw93+zP5JLC9qn4auAw4MmC/kqQBDRoA1wA7u+2dwJbpDZJcAqysqgcBqurFqvqLAfuVJA1o0AA4r6oOd9vPA+fN0OavAMeS7EqyJ8n2JCtme8MkW5NMJJk4evTogOVJkmaz8lQNkjwEnD/DqZun7lRVJalZ+ngnsBH4JvBp4IPA7TP1V1U7gB0A4+PjM72fJGkIThkAVbVptnNJXkiypqoOJ1nDzHP7B4Anquq57jX3Am9nlgCQJC2NQaeA7gdu6LZvAO6boc1jwOok53b77waeHrBfSdKABg2AW4GrkjwDbOr2STKe5DaAqjoB/AbwcJK9QID/MmC/kqQBnXIKaC5V9R3gyhmOTwAfnrL/IPAzg/QlSRounwSWpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJatTAAZDkjUkeTPJM9/vsWdr9uyRPJdmX5D8myaB9S5JO3zCuAG4EHq6qi4GHu/1XSPIO4ArgZ4C3AH8T+Lkh9C1JOk3DCIBrgJ3d9k5gywxtCngNcBbwamAV8MIQ+pYknaZhBMB5VXW4234eOG96g6r6I+DzwOHuZ3dV7RtC35Kk07RyPo2SPAScP8Opm6fuVFUlqRle/1PATwMXdIceTPLOqvrCDG23AlsBLrzwwvmUJ0k6DfMKgKraNNu5JC8kWVNVh5OsAY7M0Oz9wBer6sXuNf8buBz4sQCoqh3ADoDx8fEfCxNJ0nAMYwrofuCGbvsG4L4Z2nwT+LkkK5OsYvIGsFNAktSjYQTArcBVSZ4BNnX7JBlPclvX5m7ga8Be4CvAV6rqfw6hb0nSaZrXFNBcquo7wJUzHJ8APtxtnwB+edC+JEnD45PAktQoA0CSGmUASFKjDABJapQBIEmNMgAkqVEGgCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQMFQJIPJHkqyQ+SjM/R7uok+5M8m+TGQfqUJA3HoFcATwLXAo/O1iDJCuB3gPcAlwDXJ7lkwH4lSQNaOciLq2ofQJK5ml0GPFtVz3Vt7wSuAZ4epG9J0mCW4h7AOuBbU/YPdMdmlGRrkokkE0ePHl304iSpVae8AkjyEHD+DKdurqr7hl1QVe0AdgCMj4/XsN9fkjTplAFQVZsG7OMg8OYp+xd0xyRJPVqKKaDHgIuTXJTkLOA64P4l6FeSNIdBl4G+P8kB4HLgs0l2d8fXJnkAoKpeBj4K7Ab2AXdV1VODlS1JGtSgq4DuAe6Z4fgh4L1T9h8AHhikL0nScPkksCQ1ygCQpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRBoAkNcoAkKRGGQCS1CgDQJIaZQBIUqMMAElqlAEgSY0yACSpUQaAJDXKAJCkRhkAktQoA0CSGmUASFKjDABJatRAAZDkA0meSvKDJOOztHlzks8nebpr+7FB+pQkDcegVwBPAtcCj87R5mXgn1TVJcDbgY8kuWTAfiVJA1o5yIurah9AkrnaHAYOd9t/nmQfsA54epC+JUmDWdJ7AEnWAxuBL83RZmuSiSQTR48eXarSJKk5p7wCSPIQcP4Mp26uqvvm21GS1wOfAT5eVd+brV1V7QB2AIyPj9d831+StDCnDICq2jRoJ0lWMTn4f6qqdg36fpKkwS36FFAmbxDcDuyrqt9c7P4kSfMz6DLQ9yc5AFwOfDbJ7u742iQPdM2uAH4JeHeSJ7qf9w5UtSRpYIOuAroHuGeG44eA93bbfwjMvkxIktQLnwSWpEYZAJLUKANAkhplAEhSowwASWqUASBJjTIAJKlRAz0HMIru3XOQ7bv3c+jYcdauHmPb5g1s2biu77IkaeQsqwC4d89Bbtq1l+MvnQDg4LHj3LRrL4AhIEnTLKspoO279/9w8D/p+Esn2L57f08VSdLoWlYBcOjY8QUdl6SWLasAWLt6bEHHJallyyoAtm3ewNiqFa84NrZqBds2b+ipIkkaXcvqJvDJG72uApKkU1tWAQCTIeCAL0mntqymgCRJ82cASFKjDABJapQBIEmNMgAkqVGpqr5rmFWSo8A3TvPl5wDfHmI5w2JdC2NdC2NdC7Mc6/rJqjp3Pg1HOgAGkWSiqsb7rmM661oY61oY61qY1utyCkiSGmUASFKjlnMA7Oi7gFlY18JY18JY18I0XdeyvQcgSZrbcr4CkCTNwQCQpEYtywBIcnWS/UmeTXJj3/UAJPlEkiNJnuy7lqmSvDnJ55M8neSpJB/ruyaAJK9J8n+SfKWr61/2XdNJSVYk2ZPkf/Vdy1RJvp5kb5Inkkz0XQ9AktVJ7k7yx0n2Jbl8BGra0P0Znfz5XpKP910XQJJ/3P19fzLJHUles6j9Lbd7AElWAF8FrgIOAI8B11fV0z3X9S7gReCTVfWWPmuZKskaYE1VfTnJXwIeB7aMwJ9XgNdV1YtJVgF/CHysqr7YZ10ASX4dGAfeUFU/33c9JyX5OjBeVSPzYFOSncAXquq2JGcBr62qY33XdVI3XhwE3lZVp/vQ6bBqWcfk3/NLqup4kruAB6rqvy5Wn8vxCuAy4Nmqeq6qvg/cCVzTc01U1aPAd/uuY7qqOlxVX+62/xzYB/T+DyrUpBe73VXdT++fVpJcAPxd4La+axl1SX4CeBdwO0BVfX+UBv/OlcDX+h78p1gJjCVZCbwWOLSYnS3HAFgHfGvK/gFGYEA7EyRZD2wEvtRvJZO6qZYngCPAg1U1CnX9NvBPgR/0XcgMCvhckseTbO27GOAi4Cjwe92U2W1JXtd3UdNcB9zRdxEAVXUQ+PfAN4HDwJ9V1ecWs8/lGAA6DUleD3wG+HhVfa/vegCq6kRVvRW4ALgsSa9TZ0l+HjhSVY/3Wccc/lZV/SzwHuAj3bRjn1YCPwv8blVtBP4vMBL35AC6Kan3Ab/fdy0ASc5mcrbiImAt8Lokv7iYfS7HADgIvHnK/gXdMc2im2P/DPCpqtrVdz3TddMGnweu7rmUK4D3dXPtdwLvTvLf+y3pR7pPkFTVEeAeJqdD+3QAODDlyu1uJgNhVLwH+HJVvdB3IZ1NwJ9U1dGqegnYBbxjMTtcjgHwGHBxkou6hL8OuL/nmkZWd7P1dmBfVf1m3/WclOTcJKu77TEmb+r/cZ81VdVNVXVBVa1n8u/VI1W1qJ/Q5ivJ67qb+HTTLH8H6HXFWVU9D3wryYbu0JVAr4sLprmeEZn+6XwTeHuS13b/X17J5D25RbPs/lH4qno5yUeB3cAK4BNV9VTPZZHkDuBvA+ckOQD8i6q6vd+qgMlPtb8E7O3m2wH+WVU90GNNAGuAnd0qjVcBd1XVSC27HDHnAfdMjhusBP5HVf1BvyUB8GvAp7oPY88B/7DneoAfhuRVwC/3XctJVfWlJHcDXwZeBvawyF8JseyWgUqS5mc5TgFJkubBAJCkRhkAktQoA0CSGmUASFKjDABJapQBIEmN+v81+BkcFk2HJgAAAABJRU5ErkJggg==\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.scatter(x = scoresDF[0], y = scoresDF[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "pred = KMeans(n_clusters = 2).fit_predict(X)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}