In [40]:
pd.DataFrame.plot?

In [39]:
matplotlib?

In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot') # Look Pretty

In [36]:
# Call records for 10 users, tracked over the course of 3 years.
# Load them into `df` and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='data/CDR_data.csv')
df.head(n=5)


Out[36]:
In Out Direction CallTimestamp Duration TowerID
0 4638472273 2666307251 Incoming 2010-12-25 07:16:24.736813 0:02:41.741499 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec
1 4638472273 1755442610 Incoming 2010-12-25 21:18:30.053710 0:02:47.108750 aeaf8b43-8034-44fe-833d-31854a75acbf
2 4638472273 5481755331 Incoming 2010-12-25 14:52:42.878016 0:04:35.356341 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00
3 4638472273 1755442610 Incoming 2010-12-25 16:02:09.001913 0:02:23.498499 fadaa83f-6001-45fd-aa4a-17d6c6b7ec00
4 4638472273 2145623768 Incoming 2010-12-25 15:28:35.028554 0:03:54.692497 95d7920d-c3cd-4d20-a568-9a55800dc807

In [34]:
# Summarise the raw call records: column names, non-null counts and dtypes.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53188 entries, 0 to 53187
Data columns (total 6 columns):
In               53188 non-null int64
Out              53188 non-null int64
Direction        53188 non-null object
CallTimestamp    53188 non-null object
Duration         53188 non-null object
TowerID          53188 non-null object
dtypes: int64(2), object(4)
memory usage: 2.4+ MB

In [24]:
# CallTimestamp and Duration are read from the CSV as plain strings (object dtype).
# Parse them into datetime64 / timedelta64 so time-based operations work, then
# re-check the dtypes.
df['CallTimestamp'] = pd.to_datetime(df['CallTimestamp'])
df['Duration'] = pd.to_timedelta(df['Duration'])
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53188 entries, 0 to 53187
Data columns (total 6 columns):
In               53188 non-null int64
Out              53188 non-null int64
Direction        53188 non-null object
CallTimestamp    53188 non-null datetime64[ns]
Duration         53188 non-null timedelta64[ns]
TowerID          53188 non-null object
dtypes: datetime64[ns](1), int64(2), object(2), timedelta64[ns](1)
memory usage: 2.4+ MB

In [30]:
# Load the name -> phone number directory and preview it.
# dtype={1:str} reads the second column (the phone number) as text instead of an
# integer, which keeps any leading zeros in the numbers.
# NOTE(review): the path ends in .xlsx but is parsed with read_csv -- confirm the file is
# really delimited text; a genuine Excel workbook would need pd.read_excel instead.
phoneowners = pd.read_csv("data/phoneowners.xlsx", dtype={1:str})
phoneowners.head()


Out[30]:
name number
0 Philip Morales 03789882060
1 Julie Turner 08431579775
2 Andre Allen 09094831840
3 Nicole Duncan 00715852225
4 Matthew Nelson 00672745472

In [28]:
# Summarise the directory: column names, non-null counts and dtypes.
# NOTE(review): going by the execution counts, this cell last ran before the
# dtype={1:str} load of phoneowners, so a recorded int64 dtype for `number` may be
# stale -- re-run to confirm.
phoneowners.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
name      999 non-null object
number    999 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB

In [27]:
# Find the directory entry (and therefore the phone number) registered to "John Doe".
phoneowners.loc[phoneowners['name'] == "John Doe"]


Out[27]:
name number
498 John Doe 4638472273

In [7]:
# Keep only the call records whose `In` number belongs to the suspect.
# 4638472273 is the number listed for "John Doe" in the phoneowners lookup; `In` is an
# int64 column, so it is compared against an integer rather than a string.
# NOTE(review): the original expression was left unfinished (`df.In = `), which is a
# SyntaxError; the intended number is assumed to be John Doe's -- confirm.
suspect_data = df[df.In == 4638472273]

In [18]:
# INFO: Plot all the call locations
%matplotlib inline
user1.plot.scatter(x='TowerLon', y='TowerLat', c='purple', alpha=0.12, title='Call Locations', s = 30)
#showandtell()  # Comment this line out when you're ready to proceed


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x1e78d262d68>

In [15]:
#
# INFO: The locations map above should be too "busy" to really wrap your head around. This is where domain expertise comes into play.
# Your intuition tells you that people are likely to behave differently on weekends:
#
# On Weekends:
#   1. People probably don't go into work
#   2. They probably sleep in late on Saturday
#   3. They probably run a bunch of random errands, since they couldn't during the week
#   4. They should be home, at least during the very late hours, e.g. 1-4 AM
#
# On Weekdays:
#   1. People probably are at work during normal working hours
#   2. They probably are at home in the early morning and during the late night
#   3. They probably spend time commuting between work and home every day


#
# TODO: Add more filters to the user1 slice you created. Add bitwise logic so that you're only examining records that came in on
# weekends (sat/sun).
#
# .. your code here ..
# Keep only rows whose day-of-week label is Saturday or Sunday.
user1 = user1.loc[user1['DOW'].isin(['Sat', 'Sun'])]

#
# TODO: Further filter it down for calls that came in either before 6AM OR after 10pm (22:00:00). You can use < and > to compare
# the string times, just make sure you code them as military time strings, eg: "06:00:00", "22:00:00":
# https://en.wikipedia.org/wiki/24-hour_clock
#
# You might also want to review the Data Manipulation section for this. Once you have your filtered slice, print out its length:
#
# .. your code here ..
# Keep rows whose call time is strictly before 06:00:00 or strictly after 22:00:00,
# then preview the result.
user1 = user1.loc[user1['CallTime'].lt("06:00:00") | user1['CallTime'].gt("22:00:00")]
user1.head()


Out[15]:
In Out Direction CallDate CallTime DOW Duration TowerID TowerLat TowerLon
940 4638472273 6150928001 Incoming 2011-01-08 22:22:33.693878 Sat 00:00:35.065912 26f7441e-e64b-4b9d-8c5a-0eb34f7e919f 32.731611 -96.709417
1875 4638472273 2145623768 Incoming 2011-01-22 22:21:27.466908 Sat 00:03:40.018320 2567c73b-3711-4a31-8683-9d12d56857d8 32.731611 -96.709417
6949 4638472273 5621270003 Missed 2011-04-09 22:59:23.206408 Sat 00:03:06.567364 2567c73b-3711-4a31-8683-9d12d56857d8 32.731611 -96.709417
7468 4638472273 7756914135 Incoming 2011-04-17 05:15:40.205917 Sun 00:02:09.984993 cd9f3b1a-2eb8-4cdb-86d1-5d4c2740b1dc 32.731722 -96.709500
9753 4638472273 7922223441 Incoming 2011-05-21 05:08:20.121145 Sat 00:01:30.059591 0db53dd3-eb9c-4344-abc5-c2d74ebc3eec 32.731611 -96.709417

In [19]:
#
# INFO: Visualize the dataframe with a scatter plot as a sanity check. Since you're familiar with maps, you know well that your
# X-Coordinate should be Longitude, and your Y coordinate should be the tower Latitude. Check the dataset headers for proper column
# feature names.
# https://en.wikipedia.org/wiki/Geographic_coordinate_system#Geographic_latitude_and_longitude
#
# At this point, you don't yet know exactly where the user is located just based off the cell phone tower position data; but
# considering the below are for Calls that arrived in the twilight hours of weekends, it's likely that wherever they are bunched up
# is probably near the caller's residence:
# One figure with a single Axes; `ax` is reused later to overlay the cluster centroids.
fig, ax = plt.subplots()
# Longitude on the x-axis, latitude on the y-axis; low alpha so overlapping calls show as darker spots.
ax.scatter(x=user1['TowerLon'], y=user1['TowerLat'], c='g', marker='o', alpha=0.2)
ax.set_title('Weekend Calls (<6am or >10p)')
showandtell()  # TODO: Comment this line out when you're ready to proceed

#
# TODO: Run K-Means with a K=1. There really should only be a single area of concentration. If you notice multiple areas that are
# "hot" (multiple areas the user spends a lot of time at that are FAR apart from one another), then increase K=2, with the goal being
# that one of the centroids will sweep up the annoying outliers; and the other will zero in on the user's approximate home location.
# Or rather the location of the cell tower closest to their home.....
#
# Be sure to only feed in Lat and Lon coordinates to the KMeans algo, since none of the other data is suitable for your purposes.
# Since both Lat and Lon are (approximately) on the same scale, no feature scaling is required. Print out the centroid locations and
# add them onto your scatter plot. Use a distinguishable marker and color.
#
# Hint: Make sure you graph the CORRECT coordinates. This is part of your domain expertise.
#
# .. your code here ..



In [17]:
from sklearn.cluster import KMeans

# Cluster the weekend off-hours tower coordinates to estimate the user's home location.
# K=2 follows the exercise notes: one centroid can sweep up far-away outliers while the
# other settles on the main area of concentration.
# random_state pins the centroid initialisation so re-running the cell gives the same clusters.
kmeans = KMeans(n_clusters=2, random_state=0)

# Only longitude and latitude are fed to KMeans, in that column order.
# NOTE: this rebinds `user1` to the two-column frame, exactly as the original cell did.
user1 = user1[['TowerLon', 'TowerLat']]
labels = kmeans.fit_predict(user1)
centroids = kmeans.cluster_centers_

# One row per cluster; columns follow the input order: [TowerLon, TowerLat].
print(centroids)

# Overlay the centroids on the existing scatter: column 0 (longitude) is x, column 1 (latitude) is y.
ax.scatter(x=centroids[:, 0], y=centroids[:, 1], c='r', marker='x', s=100)

showandtell()  # TODO: Comment this line out when you're ready to proceed

In [ ]: