In [1]:
import datetime as dt

import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
matplotlib.style.use('ggplot')  # Look Pretty


def showandtell(title=None):
    """Show the current matplotlib figure(s), optionally saving to disk first.

    Parameters
    ----------
    title : str, optional
        When given, the current figure is written to "<title>.png"
        (tight bounding box, 300 dpi) before it is shown. When omitted,
        nothing is saved.
    """
    # Identity comparison is the idiomatic way to test for None.
    if title is not None:
        plt.savefig(title + ".png", bbox_inches='tight', dpi=300)
    plt.show()
In [2]:
#
# INFO: This dataset has call records for 10 users tracked over the course of 3 years.
# Your job is to find out where the users likely live and work!
In [3]:
# Read the call-detail records, then parse the call date into datetimes and
# the call duration into timedeltas so both can be used arithmetically.
df = pd.read_csv('Datasets/CDR.csv')
df['CallDate'] = pd.to_datetime(df['CallDate'])
df['Duration'] = pd.to_timedelta(df['Duration'])

# Peek at the first five rows.
df.head()
Out[3]:
In [4]:
#
# Distinct "In" phone numbers (the tracked users), stored as a regular Python list.
# ndarray.tolist() converts the NumPy scalars to native Python scalars, which a
# plain list(...) around the array would not do.
# Hint: https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.tolist.html
#
phonenumber = df['In'].unique().tolist()
phonenumber
Out[4]:
In [5]:
# Slice of the dataset holding only the records whose "In" feature (user phone
# number) equals the first number on the unique list above, i.e. the very first
# number in the dataset. Taken from the list rather than hard-coded so the cell
# keeps working if the dataset changes.
user = phonenumber[0]
user1 = df[df.In == user]
In [6]:
# Scatter every tower location user1 received a call at (longitude on x,
# latitude on y); the low alpha makes frequently used towers appear darker.
user1.plot(
    kind='scatter',
    x='TowerLon',
    y='TowerLat',
    c='gray',
    alpha=0.1,
    title='Call Locations',
)
Out[6]:
In [7]:
# INFO: The locations map above should be too "busy" to really wrap your head around. This
# is where domain expertise comes into play. Your intuition tells you that people are likely
# to behave differently on weekends:
#
# On Weekends:
# 1. People probably don't go into work
# 2. They probably sleep in late on Saturday
# 3. They probably run a bunch of random errands, since they couldn't during the week
# 4. They should be home, at least during the very late hours, e.g. 1-4 AM
#
# On Weekdays:
# 1. People probably are at work during normal working hours
# 2. They probably are at home in the early morning and during the late night
# 3. They probably spend time commuting between work and home every day
In [8]:
# Step 1: keep only the records that came in on a weekend (Sat/Sun).
is_weekend = user1['DOW'].isin(['Sat', 'Sun'])
user1 = user1[is_weekend]

# Step 2: of those, keep calls that came in either before 6 AM or after 10 PM.
# CallTime is compared against 24-hour ("military") time strings such as
# "06:00:00" and "22:00:00", so plain < and > on the strings order them correctly:
# https://en.wikipedia.org/wiki/24-hour_clock
is_late_night = (user1['CallTime'] < "06:00:00") | (user1['CallTime'] > "22:00:00")
user1 = user1[is_late_night]

# Report how many records survived both filters instead of rendering the
# entire frame.
len(user1)
Out[8]:
In [11]:
#
# INFO: Visualize the dataframe with a scatter plot as a sanity check. Since you're familiar with maps, you know well that your
# X-Coordinate should be Longitude, and your Y coordinate should be the tower Latitude. Check the dataset headers for proper column
# feature names.
# https://en.wikipedia.org/wiki/Geographic_coordinate_system#Geographic_latitude_and_longitude
#
# At this point, you don't yet know exactly where the user is located based on the cell phone tower position data alone; but
# since the points below are calls that arrived in the twilight hours of weekends, it's likely that wherever they are bunched up
# is near the user's residence:
# One figure with a single axes: tower longitude on x, tower latitude on y.
fig, ax = plt.subplots()
ax.scatter(user1['TowerLon'], user1['TowerLat'], c='g', marker='o', alpha=0.2)
ax.set_title('Weekend Calls (<6am or >10p)')
showandtell()
In [14]:
# Cluster user1's weekend late-night tower coordinates into two groups; the
# cluster centers are candidate locations for where the user spends those hours.
# KMeans is imported in the notebook's import cell.
# The coordinates get their own name so user1 keeps all of its columns and this
# cell can be re-run safely.
user1_coords = user1[['TowerLon', 'TowerLat']]

# random_state makes the (otherwise randomly initialised) clustering
# reproducible; n_init is spelled out so the result does not depend on the
# installed scikit-learn's default.
kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
labels = kmeans.fit_predict(user1_coords)
centroids = kmeans.cluster_centers_

# Draw on a fresh figure: the inline backend closes a figure once its cell has
# finished, so adding the centroids to the axes of the previous cell would
# render nothing here.
fig, ax = plt.subplots()
ax.scatter(user1_coords['TowerLon'], user1_coords['TowerLat'], c='g', marker='o', alpha=0.2)
ax.scatter(x=centroids[:, 0], y=centroids[:, 1], c='r', marker='x', s=100)
ax.set_title('Weekend Calls (<6am or >10p)')
plt.show()
In [16]:
# Repeat the single-user analysis for every user: plot all call locations,
# narrow down to weekend calls before 6 AM or after 10 PM, cluster those tower
# coordinates into two groups and collect the cluster centers.
# locations[k] holds the 2x2 array of (TowerLon, TowerLat) centers for phonenumber[k].
locations = []

# Iterate over the list itself rather than a hard-coded range(10), so the loop
# follows however many users the dataset actually contains.
for number in phonenumber:
    user = df[df.In == number]
    user.plot.scatter(x='TowerLon', y='TowerLat', c='purple', alpha=0.12, title='Call Locations', s=30)

    user = user[user.DOW.isin(['Sat', 'Sun'])]
    user = user[(user.CallTime < "06:00:00") | (user.CallTime > "22:00:00")]

    fig, ax = plt.subplots()
    ax.scatter(user.TowerLon, user.TowerLat, c='g', marker='o', alpha=0.2)
    ax.set_title('Weekend Calls (<6am or >10p)')

    # Fixed seed and explicit n_init keep the centers reproducible across runs.
    kmeans = KMeans(n_clusters=2, n_init=10, random_state=0)
    coords = user[['TowerLon', 'TowerLat']]
    labels = kmeans.fit_predict(coords)
    centroids = kmeans.cluster_centers_
    ax.scatter(x=centroids[:, 0], y=centroids[:, 1], c='r', marker='x', s=100)
    locations.append(centroids)

    # Show this user's figures before moving on so open figures do not
    # accumulate across iterations.
    showandtell()
In [ ]: