notebook.community

Edit and run



In [40]:

    
# IPython introspection: the trailing `?` shows the docstring of DataFrame.plot.
# NOTE(review): `pd` is imported in a cell further down this notebook, so this
# lookup fails on a fresh top-to-bottom run — leftover exploration, safe to drop.
pd.DataFrame.plot?



In [39]:

    
# IPython introspection: the trailing `?` shows the docstring of the matplotlib package.
# NOTE(review): `matplotlib` is imported in a cell further down this notebook, so
# this lookup fails on a fresh top-to-bottom run — leftover exploration, safe to drop.
matplotlib?



In [20]:

    
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
matplotlib.style.use('ggplot') # Look Pretty



In [36]:

    
# Load the call detail records and preview the first rows.
# Per the dataset notes: call records for 10 users tracked over the course of 3 years.
df = pd.read_csv(filepath_or_buffer='data/CDR_data.csv')
df.head(n=5)









    Out[36]:







  
    
      
      In
      Out
      Direction
      CallTimestamp
      Duration
      TowerID
    
  
  
    
      0
      4638472273
      2666307251
      Incoming
      2010-12-25 07:16:24.736813
      0:02:41.741499
      0db53dd3-eb9c-4344-abc5-c2d74ebc3eec
    
    
      1
      4638472273
      1755442610
      Incoming
      2010-12-25 21:18:30.053710
      0:02:47.108750
      aeaf8b43-8034-44fe-833d-31854a75acbf
    
    
      2
      4638472273
      5481755331
      Incoming
      2010-12-25 14:52:42.878016
      0:04:35.356341
      fadaa83f-6001-45fd-aa4a-17d6c6b7ec00
    
    
      3
      4638472273
      1755442610
      Incoming
      2010-12-25 16:02:09.001913
      0:02:23.498499
      fadaa83f-6001-45fd-aa4a-17d6c6b7ec00
    
    
      4
      4638472273
      2145623768
      Incoming
      2010-12-25 15:28:35.028554
      0:03:54.692497
      95d7920d-c3cd-4d20-a568-9a55800dc807



In [34]:

    
# Summarise the call records: row count plus per-column non-null counts and dtypes.
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53188 entries, 0 to 53187
Data columns (total 6 columns):
In               53188 non-null int64
Out              53188 non-null int64
Direction        53188 non-null object
CallTimestamp    53188 non-null object
Duration         53188 non-null object
TowerID          53188 non-null object
dtypes: int64(2), object(4)
memory usage: 2.4+ MB



In [24]:

    
# Parse the two text columns into proper temporal dtypes
# (timestamp -> datetime64, call length -> timedelta64), then re-check the schema.
df['CallTimestamp'] = pd.to_datetime(df['CallTimestamp'])
df['Duration'] = pd.to_timedelta(df['Duration'])
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53188 entries, 0 to 53187
Data columns (total 6 columns):
In               53188 non-null int64
Out              53188 non-null int64
Direction        53188 non-null object
CallTimestamp    53188 non-null datetime64[ns]
Duration         53188 non-null timedelta64[ns]
TowerID          53188 non-null object
dtypes: datetime64[ns](1), int64(2), object(2), timedelta64[ns](1)
memory usage: 2.4+ MB



In [30]:

    
# Load the name -> phone number lookup table and preview it.
# The file carries an .xlsx extension but is parsed here with the CSV reader.
# dtype={1: str} reads the second column as text so leading zeros in the
# numbers are preserved.
phoneowners = pd.read_csv(
    "data/phoneowners.xlsx",
    dtype={1: str},
)
phoneowners.head(n=5)









    Out[30]:







  
    
      
      name
      number
    
  
  
    
      0
      Philip Morales
      03789882060
    
    
      1
      Julie Turner
      08431579775
    
    
      2
      Andre Allen
      09094831840
    
    
      3
      Nicole Duncan
      00715852225
    
    
      4
      Matthew Nelson
      00672745472



In [28]:

    
# Summarise the phone-owner table: row count plus per-column non-null counts and dtypes.
# NOTE(review): the saved output of this cell lists `number` as int64, while the
# loader cell requests str for that column — the output looks stale (the cell was
# run before the loader was changed); re-run to refresh.
phoneowners.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 999 entries, 0 to 998
Data columns (total 2 columns):
name      999 non-null object
number    999 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.7+ KB



In [27]:

    
# Show every phone-owner row whose name is exactly "John Doe".
phoneowners.loc[phoneowners['name'] == "John Doe"]



In [7]:

    
# Select the call records received by the suspect.
# The original line (`df[df.In = ]`) was an unfinished expression and a syntax
# error: `=` is assignment, not comparison, and the right-hand side was missing.
# NOTE(review): the target is assumed to be the "John Doe" looked up in the
# previous cell — confirm. Phone numbers are text in `phoneowners` but int64 in
# the call records, so they are cast to int64 before matching.
suspect_numbers = phoneowners.loc[phoneowners['name'] == "John Doe", 'number'].astype('int64')
suspect_data = df[df['In'].isin(suspect_numbers)]



In [18]:

    
# INFO: Plot all the call locations
%matplotlib inline
user1.plot.scatter(x='TowerLon', y='TowerLat', c='purple', alpha=0.12, title='Call Locations', s = 30)
#showandtell()  # Comment this line out when you're ready to proceed









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x1e78d262d68>



In [15]:

    
#
# INFO: The locations map above should be too "busy" to really wrap your head around. This is where domain expertise comes into play.
# Your intuition tells you that people are likely to behave differently on weekends:
#
# On Weekends:
#   1. People probably don't go into work
#   2. They probably sleep in late on Saturday
#   3. They probably run a bunch of random errands, since they couldn't during the week
#   4. They should be home, at least during the very late hours, e.g. 1-4 AM
#
# On Weekdays:
#   1. People probably are at work during normal working hours
#   2. They probably are at home in the early morning and during the late night
#   3. They probably spend time commuting between work and home every day


#
# TODO: Add more filters to the user1 slice you created, so that you're only examining records that came in on
# weekends (sat/sun).
#
# Keep only the rows whose day-of-week label is Saturday or Sunday.
user1 = user1.loc[user1['DOW'].isin(['Sat', 'Sun'])]

#
# TODO: Further filter it down to calls that came in either before 6AM OR after 10pm (22:00:00). You can use < and > to compare
# the string times, just make sure you code them as military time strings, e.g. "06:00:00", "22:00:00":
# https://en.wikipedia.org/wiki/24-hour_clock
#
# You might also want to review the Data Manipulation section for this. Once you have your filtered slice, print out its length:
#
# Keep only the rows whose call time falls before 06:00 or after 22:00, then preview the result.
user1 = user1.loc[user1['CallTime'].lt("06:00:00") | user1['CallTime'].gt("22:00:00")]
user1.head(n=5)









    Out[15]:







  
    
      
      In
      Out
      Direction
      CallDate
      CallTime
      DOW
      Duration
      TowerID
      TowerLat
      TowerLon
    
  
  
    
      940
      4638472273
      6150928001
      Incoming
      2011-01-08
      22:22:33.693878
      Sat
      00:00:35.065912
      26f7441e-e64b-4b9d-8c5a-0eb34f7e919f
      32.731611
      -96.709417
    
    
      1875
      4638472273
      2145623768
      Incoming
      2011-01-22
      22:21:27.466908
      Sat
      00:03:40.018320
      2567c73b-3711-4a31-8683-9d12d56857d8
      32.731611
      -96.709417
    
    
      6949
      4638472273
      5621270003
      Missed
      2011-04-09
      22:59:23.206408
      Sat
      00:03:06.567364
      2567c73b-3711-4a31-8683-9d12d56857d8
      32.731611
      -96.709417
    
    
      7468
      4638472273
      7756914135
      Incoming
      2011-04-17
      05:15:40.205917
      Sun
      00:02:09.984993
      cd9f3b1a-2eb8-4cdb-86d1-5d4c2740b1dc
      32.731722
      -96.709500
    
    
      9753
      4638472273
      7922223441
      Incoming
      2011-05-21
      05:08:20.121145
      Sat
      00:01:30.059591
      0db53dd3-eb9c-4344-abc5-c2d74ebc3eec
      32.731611
      -96.709417



In [19]:

    
#
# INFO: Visualize the dataframe with a scatter plot as a sanity check. Since you're familiar with maps, you know well that your
# X-Coordinate should be Longitude, and your Y coordinate should be the tower Latitude. Check the dataset headers for proper column
# feature names.
# https://en.wikipedia.org/wiki/Geographic_coordinate_system#Geographic_latitude_and_longitude
#
# At this point, you don't yet know exactly where the user is located just based off the cell phone tower position data; but
# considering the below are for calls that arrived in the twilight hours of weekends, it's likely that wherever they are bunched up
# is probably near the caller's residence:
# One figure with a single axes; `ax` is reused by the clustering cell that follows.
fig, ax = plt.subplots()
ax.scatter(user1['TowerLon'], user1['TowerLat'], c='g', marker='o', alpha=0.2)
ax.set_title('Weekend Calls (<6am or >10p)')
# NOTE(review): `showandtell` is not defined in any cell visible in this notebook — confirm where it comes from.
showandtell()  # TODO: Comment this line out when you're ready to proceed

#
# TODO: Run K-Means with a K=1. There really should only be a single area of concentration. If you notice multiple areas that are
# "hot" (multiple areas the user spends a lot of time at that are FAR apart from one another), then increase K=2, with the goal being
# that one of the centroids will sweep up the annoying outliers; and the other will zero in on the user's approximate home location.
# Or rather the location of the cell tower closest to their home.....
#
# Be sure to only feed in Lat and Lon coordinates to the KMeans algo, since none of the other data is suitable for your purposes.
# Since both Lat and Lon are (approximately) on the same scale, no feature scaling is required. Print out the centroid locations and
# add them onto your scatter plot. Use a distinguishable marker and color.
#
# Hint: Make sure you graph the CORRECT coordinates. This is part of your domain expertise.
#
# .. your code here ..



In [17]:

    
from sklearn.cluster import KMeans

# Cluster the weekend / late-hour calls by tower position to estimate the home location.
# Only longitude and latitude are fed to KMeans. They go into a separate frame so
# that `user1` keeps all of its columns and this cell can be re-run safely
# (the original overwrote `user1` with the two-column frame).
user1_coords = user1[['TowerLon', 'TowerLat']]

# K=2: one centroid is meant to absorb far-away outliers, the other to settle on
# the main concentration. random_state pins the initialisation so the centroids
# are reproducible between runs.
kmeans = KMeans(n_clusters=2, random_state=0)
labels = kmeans.fit_predict(user1_coords)

# Columns follow the input order: [:, 0] is longitude, [:, 1] is latitude.
centroids = kmeans.cluster_centers_
print(centroids)

# Overlay the centroids on the axes created by the preceding plotting cell.
ax.scatter(x=centroids[:, 0], y=centroids[:, 1], c='r', marker='x', s=100)

showandtell()  # TODO: Comment this line out when you're ready to proceed



In [ ]:

	In	Out	Direction	CallTimestamp	Duration	TowerID
0	4638472273	2666307251	Incoming	2010-12-25 07:16:24.736813	0:02:41.741499	0db53dd3-eb9c-4344-abc5-c2d74ebc3eec
1	4638472273	1755442610	Incoming	2010-12-25 21:18:30.053710	0:02:47.108750	aeaf8b43-8034-44fe-833d-31854a75acbf
2	4638472273	5481755331	Incoming	2010-12-25 14:52:42.878016	0:04:35.356341	fadaa83f-6001-45fd-aa4a-17d6c6b7ec00
3	4638472273	1755442610	Incoming	2010-12-25 16:02:09.001913	0:02:23.498499	fadaa83f-6001-45fd-aa4a-17d6c6b7ec00
4	4638472273	2145623768	Incoming	2010-12-25 15:28:35.028554	0:03:54.692497	95d7920d-c3cd-4d20-a568-9a55800dc807

	name	number
0	Philip Morales	03789882060
1	Julie Turner	08431579775
2	Andre Allen	09094831840
3	Nicole Duncan	00715852225
4	Matthew Nelson	00672745472

	In	Out	Direction	CallDate	CallTime	DOW	Duration	TowerID	TowerLat	TowerLon
940	4638472273	6150928001	Incoming	2011-01-08	22:22:33.693878	Sat	00:00:35.065912	26f7441e-e64b-4b9d-8c5a-0eb34f7e919f	32.731611	-96.709417
1875	4638472273	2145623768	Incoming	2011-01-22	22:21:27.466908	Sat	00:03:40.018320	2567c73b-3711-4a31-8683-9d12d56857d8	32.731611	-96.709417
6949	4638472273	5621270003	Missed	2011-04-09	22:59:23.206408	Sat	00:03:06.567364	2567c73b-3711-4a31-8683-9d12d56857d8	32.731611	-96.709417
7468	4638472273	7756914135	Incoming	2011-04-17	05:15:40.205917	Sun	00:02:09.984993	cd9f3b1a-2eb8-4cdb-86d1-5d4c2740b1dc	32.731722	-96.709500
9753	4638472273	7922223441	Incoming	2011-05-21	05:08:20.121145	Sat	00:01:30.059591	0db53dd3-eb9c-4344-abc5-c2d74ebc3eec	32.731611	-96.709417