In [40]:
pd.DataFrame.plot?
In [39]:
matplotlib?
In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot') # Look Pretty
In [36]:
# This dataset has call records for 10 users tracked over the course of 3 years.
df = pd.read_csv('data/CDR_data.csv')
df.head()
Out[36]:
In [34]:
df.info()
In [24]:
df.CallTimestamp = pd.to_datetime(df.CallTimestamp)
df.Duration = pd.to_timedelta(df.Duration)
df.info()
In [30]:
phoneowners = pd.read_csv("data/phoneowners.xlsx", dtype={1:str})
phoneowners.head()
Out[30]:
In [28]:
phoneowners.info()
In [27]:
phoneowners[phoneowners.name == "John Doe"]
Out[27]:
In [7]:
suspect_data = df[df.In = ]
In [18]:
# INFO: Plot all the call locations
%matplotlib inline
user1.plot.scatter(x='TowerLon', y='TowerLat', c='purple', alpha=0.12, title='Call Locations', s = 30)
#showandtell() # Comment this line out when you're ready to proceed
Out[18]:
In [15]:
#
# INFO: The locations map above should be too "busy" to really wrap your head around. This is where domain expertise comes into play.
# Your intuition tells you that people are likely to behave differently on weekends:
#
# On Weekends:
# 1. People probably don't go into work
# 2. They probably sleep in late on Saturday
# 3. They probably run a bunch of random errands, since they couldn't during the week
# 4. They should be home, at least during the very late hours, e.g. 1-4 AM
#
# On Weekdays:
# 1. People probably are at work during normal working hours
# 2. They probably are at home in the early morning and during the late night
# 3. They probably spend time commuting between work and home everyday
#
# TODO: Add more filters to the user1 slice you created. Add bitwise logic so that you're only examining records that came in on
# weekends (sat/sun).
#
# .. your code here ..
user1 = user1[(user1.DOW == 'Sat') | (user1.DOW == 'Sun')]
#
# TODO: Further filter it down for calls that are came in either before 6AM OR after 10pm (22:00:00). You can use < and > to compare
# the string times, just make sure you code them as military time strings, eg: "06:00:00", "22:00:00":
# https://en.wikipedia.org/wiki/24-hour_clock
#
# You might also want to review the Data Manipulation section for this. Once you have your filtered slice, print out its length:
#
# .. your code here ..
user1 = user1[(user1.CallTime < "06:00:00") | (user1.CallTime > "22:00:00")]
user1.head()
Out[15]:
In [19]:
#
# INFO: Visualize the dataframe with a scatter plot as a sanity check. Since you're familiar with maps, you know well that your
# X-Coordinate should be Longitude, and your Y coordinate should be the tower Latitude. Check the dataset headers for proper column
# feature names.
# https://en.wikipedia.org/wiki/Geographic_coordinate_system#Geographic_latitude_and_longitude
#
# At this point, you don't yet know exactly where the user is located just based off the cell phone tower position data; but
# considering the below are for Calls that arrived in the twilight hours of weekends, it's likely that wherever they are bunched up
# is probably near the caller's residence:
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(user1.TowerLon,user1.TowerLat, c='g', marker='o', alpha=0.2)
ax.set_title('Weekend Calls (<6am or >10p)')
showandtell() # TODO: Comment this line out when you're ready to proceed
#
# TODO: Run K-Means with a K=1. There really should only be a single area of concentration. If you notice multiple areas that are
# "hot" (multiple areas the usr spends a lot of time at that are FAR apart from one another), then increase K=2, with the goal being
# that one of the centroids will sweep up the annoying outliers; and the other will zero in on the user's approximate home location.
# Or rather the location of the cell tower closest to their home.....
#
# Be sure to only feed in Lat and Lon coordinates to the KMeans algo, since none of the other data is suitable for your purposes.
# Since both Lat and Lon are (approximately) on the same scale, no feature scaling is required. Print out the centroid locations and
# add them onto your scatter plot. Use a distinguishable marker and color.
#
# Hint: Make sure you graph the CORRECT coordinates. This is part of your domain expertise.
#
# .. your code here ..
In [17]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 2)
user1 = pd.concat([user1.TowerLon, user1.TowerLat], axis = 1)
labels = kmeans.fit_predict(user1)
centroids = kmeans.cluster_centers_
ax.scatter(x = centroids[:, 0], y = centroids[:, 1], c = 'r', marker = 'x', s = 100)
showandtell() # TODO: Comment this line out when you're ready to proceed
In [ ]: