In [1]:
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib
import datetime as dt

matplotlib.style.use('ggplot') # Look Pretty

def showandtell(title=None):
  if title != None: plt.savefig(title + ".png", bbox_inches='tight', dpi=300)
  plt.show()
  # exit()

In [2]:
#
# INFO: This dataset has call records for 10 users tracked over the course of 3 years.
# Your job is to find out where the users likely live and work at!

In [3]:
# TODO: Load up the dataset and take a peek at its head
# Convert the date using pd.to_datetime, and the time using pd.to_timedelta
df = pd.read_csv('Datasets/CDR.csv')
df.CallDate = pd.to_datetime(df.CallDate)
df.Duration = pd.to_timedelta(df.Duration)
df.head(5)


Out[3]:
In Out Direction CallDate CallTime DOW Duration TowerID TowerLat TowerLon
0 4638472273 2666307251 Incoming 2010-12-25 07:16:24.736813 Sat 00:02:41.741499 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec 32.731611 -96.709417
1 4638472273 1755442610 Incoming 2010-12-25 21:18:30.053710 Sat 00:02:47.108750 aeaf8b43-8034-44fe-833d-31854a75acbf 32.731722 -96.709500
2 4638472273 5481755331 Incoming 2010-12-25 14:52:42.878016 Sat 00:04:35.356341 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 32.899944 -96.910389
3 4638472273 1755442610 Incoming 2010-12-25 16:02:09.001913 Sat 00:02:23.498499 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00 32.899944 -96.910389
4 4638472273 2145623768 Incoming 2010-12-25 15:28:35.028554 Sat 00:03:54.692497 95d7920d-c3cd-4d20-a568-9a55800dc807 32.899944 -96.910389

In [4]:
#
# TODO: Get a distinct list of "In" phone numbers (users) and store the values in a
# regular python list.
# Hint: https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.tolist.html
#
phonenumber = list(df.In.unique())
phonenumber


Out[4]:
[4638472273,
 1559410755,
 4931532174,
 2419930464,
 1884182865,
 3688089071,
 4555003213,
 2068627935,
 2894365987,
 8549533077]

In [5]:
# TODO: Create a slice called user1 that filters to only include dataset records where the
# "In" feature (user phone number) is equal to the first number on your unique list above;
# that is, the very first number in the dataset
user = 4638472273
user1 = df[df.In == user]

In [6]:
# INFO: Plot all the call locations
user1.plot.scatter(x='TowerLon', y='TowerLat', c='gray', alpha=0.1, title='Call Locations')
#showandtell()  # Comment this line out when you're ready to proceed


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x1112dc320>

In [7]:
# INFO: The locations map above should be too "busy" to really wrap your head around. This
# is where domain expertise comes into play. Your intuition tells you that people are likely
# to behave differently on weekends:
#
# On Weekends:
#   1. People probably don't go into work
#   2. They probably sleep in late on Saturday
#   3. They probably run a bunch of random errands, since they couldn't during the week
#   4. They should be home, at least during the very late hours, e.g. 1-4 AM
#
# On Weekdays:
#   1. People probably are at work during normal working hours
#   2. They probably are at home in the early morning and during the late night
#   3. They probably spend time commuting between work and home everyday

In [8]:
# TODO: Add more filters to the user1 slice you created. Add bitwise logic so that you're
# only examining records that came in on weekends (sat/sun).
#
user1 = user1[user1['DOW'].isin(['Sat', 'Sun'])]

# TODO: Further filter it down for calls that came in either before 6AM OR after 10pm (22:00:00).
# You can use < and > to compare the string times, just make sure you code them as military time
# strings, eg: "06:00:00", "22:00:00": https://en.wikipedia.org/wiki/24-hour_clock
#
# You might also want to review the Data Manipulation section for this. Once you have your filtered
# slice, print out its length:
#
user1 = user1[(user1.CallTime < "06:00:00")|(user1.CallTime >  "22:00:00")]
user1


Out[8]:
In Out Direction CallDate CallTime DOW Duration TowerID TowerLat TowerLon
940 4638472273 6150928001 Incoming 2011-01-08 22:22:33.693878 Sat 00:00:35.065912 26f7441e-e64b-4b9d-8c5a-0eb34f7e919f 32.731611 -96.709417
1875 4638472273 2145623768 Incoming 2011-01-22 22:21:27.466908 Sat 00:03:40.018320 2567c73b-3711-4a31-8683-9d12d56857d8 32.731611 -96.709417
6949 4638472273 5621270003 Missed 2011-04-09 22:59:23.206408 Sat 00:03:06.567364 2567c73b-3711-4a31-8683-9d12d56857d8 32.731611 -96.709417
7468 4638472273 7756914135 Incoming 2011-04-17 05:15:40.205917 Sun 00:02:09.984993 cd9f3b1a-2eb8-4cdb-86d1-5d4c2740b1dc 32.731722 -96.709500
9753 4638472273 7922223441 Incoming 2011-05-21 05:08:20.121145 Sat 00:01:30.059591 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec 32.731611 -96.709417
13951 4638472273 7428841195 Missed 2011-07-23 22:56:02.014834 Sat 00:04:48.497077 26f7441e-e64b-4b9d-8c5a-0eb34f7e919f 32.731611 -96.709417
17515 4638472273 2666307251 Incoming 2011-09-11 05:44:15.432523 Sun 00:01:39.500414 bcec0121-164d-406c-8c04-d12870893992 32.731611 -96.709417
21638 4638472273 6150928001 Incoming 2011-11-13 05:10:41.954161 Sun 00:00:44.369613 f3f289f5-7d29-4d3f-9b9a-a0ceaca10a1d 32.750556 -96.694722
22045 4638472273 6022767590 Missed 2011-11-19 22:59:53.166951 Sat 00:00:05.795265 8ba7e0d7-eb6e-4828-9c70-c469954a627d 32.731611 -96.709417
22502 4638472273 5621270003 Incoming 2011-11-26 22:10:48.879076 Sat 00:03:07.592270 bcec0121-164d-406c-8c04-d12870893992 32.731611 -96.709417
27866 4638472273 2946222380 Missed 2012-02-12 22:48:34.319367 Sun 00:03:24.762196 1bfc786a-d4c2-464c-bc45-b9aad0c81216 32.731722 -96.709500
31975 4638472273 1755442610 Incoming 2012-04-14 05:42:32.004437 Sat 00:07:02.657988 d760c426-4d58-4f9c-9950-b68a3a54612c 32.731611 -96.709417
32605 4638472273 7756914135 Incoming 2012-04-22 22:44:21.992759 Sun 00:02:37.995933 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec 32.731611 -96.709417
32606 4638472273 7922223441 Incoming 2012-04-22 05:02:55.678110 Sun 00:03:50.596153 8ba7e0d7-eb6e-4828-9c70-c469954a627d 32.731611 -96.709417
34427 4638472273 5621270003 Incoming 2012-05-19 05:42:45.405430 Sat 00:03:33.104834 2567c73b-3711-4a31-8683-9d12d56857d8 32.731611 -96.709417
38110 4638472273 2145623768 Incoming 2012-07-15 05:59:38.556670 Sun 00:04:29.583137 1bfc786a-d4c2-464c-bc45-b9aad0c81216 32.731722 -96.709500
38564 4638472273 5481755331 Incoming 2012-07-22 22:14:37.361635 Sun 00:01:44.568702 aeaf8b43-8034-44fe-833d-31854a75acbf 32.731722 -96.709500
39870 4638472273 8180005891 Incoming 2012-08-12 22:31:22.519784 Sun 00:00:12.541785 26f7441e-e64b-4b9d-8c5a-0eb34f7e919f 32.731611 -96.709417
40275 4638472273 7756914135 Missed 2012-08-19 05:33:32.103650 Sun 00:01:35.932559 aeaf8b43-8034-44fe-833d-31854a75acbf 32.731722 -96.709500
40789 4638472273 7428841195 Incoming 2012-08-26 22:23:58.453302 Sun 00:06:08.978230 bcec0121-164d-406c-8c04-d12870893992 32.731611 -96.709417
41757 4638472273 7922223441 Incoming 2012-09-08 22:45:53.008411 Sat 00:04:26.260140 2567c73b-3711-4a31-8683-9d12d56857d8 32.731611 -96.709417
41758 4638472273 7428841195 Incoming 2012-09-08 22:19:08.522062 Sat 00:04:18.402811 bcec0121-164d-406c-8c04-d12870893992 32.731611 -96.709417
42224 4638472273 7922223441 Incoming 2012-09-15 05:14:40.655349 Sat 00:04:07.561627 cd9f3b1a-2eb8-4cdb-86d1-5d4c2740b1dc 32.731722 -96.709500
44498 4638472273 7721472178 Incoming 2012-10-21 22:00:09.753350 Sun 00:03:24.773041 d760c426-4d58-4f9c-9950-b68a3a54612c 32.731611 -96.709417
44927 4638472273 1755442610 Incoming 2012-10-27 05:47:54.723850 Sat 00:04:28.876617 1bfc786a-d4c2-464c-bc45-b9aad0c81216 32.731722 -96.709500
44994 4638472273 7922223441 Incoming 2012-10-28 05:24:17.234557 Sun 00:02:00.473836 bde84490-896f-4060-8bd0-bd7a90cc20d9 32.750556 -96.694722
49203 4638472273 8068555700 Incoming 2012-12-29 05:08:40.433945 Sat 00:03:10.280023 aeaf8b43-8034-44fe-833d-31854a75acbf 32.731722 -96.709500
51586 4638472273 2946222380 Incoming 2013-02-03 22:14:12.703233 Sun 00:04:05.550748 cd9f3b1a-2eb8-4cdb-86d1-5d4c2740b1dc 32.731722 -96.709500

In [11]:
#
# INFO: Visualize the dataframe with a scatter plot as a sanity check. Since you're familiar with maps, you know well that your
# X-Coordinate should be Longitude, and your Y coordinate should be the tower Latitude. Check the dataset headers for proper column
# feature names.
# https://en.wikipedia.org/wiki/Geographic_coordinate_system#Geographic_latitude_and_longitude
#
# At this point, you don't yet know exactly where the user is located just based off the cell phone tower position data; but
# considering the below are for Calls that arrived in the twilight hours of weekends, it's likely that wherever they are bunched up
# is probably near the caller's residence:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(user1.TowerLon,user1.TowerLat, c='g', marker='o', alpha=0.2)
ax.set_title('Weekend Calls (<6am or >10p)')
showandtell()  # TODO: Comment this line out when you're ready to proceed

In [14]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters = 2)
user1 = pd.concat([user1.TowerLon, user1.TowerLat], axis = 1)
labels = kmeans.fit_predict(user1)

centroids = kmeans.cluster_centers_
ax.scatter(x = centroids[:, 0], y = centroids[:, 1], c = 'r', marker = 'x', s = 100)

plt.show()  # TODO: Comment this line out when you're ready to proceed

In [16]:
locations = []
for i in range(10):
	user = df[(df.In == phonenumber[i])]
	user.plot.scatter(x='TowerLon', y='TowerLat', c='purple', alpha=0.12, title='Call Locations', s = 30)
	user = user[(user.DOW == 'Sat') | (user.DOW == 'Sun')]
	user = user[(user.CallTime < "06:00:00") | (user.CallTime > "22:00:00")]
	fig = plt.figure()
	ax = fig.add_subplot(111)
	ax.scatter(user.TowerLon, user.TowerLat, c='g', marker='o', alpha=0.2)
	ax.set_title('Weekend Calls (<6am or >10p)')
	kmeans = KMeans(n_clusters = 2)
	user = pd.concat([user.TowerLon, user.TowerLat], axis = 1)
	labels = kmeans.fit_predict(user)

	centroids = kmeans.cluster_centers_
	ax.scatter(x = centroids[:, 0], y = centroids[:, 1], c = 'r', marker = 'x', s = 100)
	locations.append(centroids)
showandtell()

In [ ]: