In [8]:

    
# Import stuff

import rpy2.interactive 
import rpy2.interactive.packages
%load_ext rpy2.ipython

# Directly convert objects from pandas to r and vsv
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#Set the global figure size
plt.rcParams['figure.figsize'] = (8.0, 8.0)









    



The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython



In [7]:

    
%%R
# Load R libaries
library(ggplot2)
library(dplyr)



In [2]:

    
%ls









    



correlation_images.jpeg  lesson4_student.rmd  pseudo_facebook.tsv
Lesson 4.ipynb           Mitchell.csv



In [ ]:

    
#let's move the correct data file over from lesson 3 and use some bash knowledge
!cp  Data-Science/dand/exploratory_data_analysis/lesson3/pseudo_facebook.tsv
     Data-Science/dand/exploratory_data_analysis/lesson4/
# I dit it in bash



In [3]:

    
%ls #yep









    



correlation_images.jpeg  lesson4_student.rmd
Lesson 4.ipynb           pseudo_facebook.tsv

Load and summarize

In R



In [3]:

    
%%R
#load it in R
pf <-read.csv('pseudo_facebook.tsv', sep='\t')
head(pf)









    





   userid age dob_day dob_year dob_month gender tenure friend_count
1 2094382  14      19     1999        11   male    266            0
2 1192601  14       2     1999        11 female      6            0
3 2083884  14      16     1999        11   male     13            0
4 1203168  14      25     1999        12 female     93            0
5 1733186  14       4     1999        12   male     82            0
6 1524765  14       1     1999        12   male     15            0
  friendships_initiated likes likes_received mobile_likes mobile_likes_received
1                     0     0              0            0                     0
2                     0     0              0            0                     0
3                     0     0              0            0                     0
4                     0     0              0            0                     0
5                     0     0              0            0                     0
6                     0     0              0            0                     0
  www_likes www_likes_received
1         0                  0
2         0                  0
3         0                  0
4         0                  0
5         0                  0
6         0                  0



In [6]:

    
%R names(pf)









    Out[6]:





array(['userid', 'age', 'dob_day', 'dob_year', 'dob_month', 'gender',
       'tenure', 'friend_count', 'friendships_initiated', 'likes',
       'likes_received', 'mobile_likes', 'mobile_likes_received',
       'www_likes', 'www_likes_received'], 
      dtype='<U21')



In [8]:

    
%%R
summary(pf)









    





     userid             age            dob_day         dob_year   
 Min.   :1000008   Min.   : 13.00   Min.   : 1.00   Min.   :1900  
 1st Qu.:1298806   1st Qu.: 20.00   1st Qu.: 7.00   1st Qu.:1963  
 Median :1596148   Median : 28.00   Median :14.00   Median :1985  
 Mean   :1597045   Mean   : 37.28   Mean   :14.53   Mean   :1976  
 3rd Qu.:1895744   3rd Qu.: 50.00   3rd Qu.:22.00   3rd Qu.:1993  
 Max.   :2193542   Max.   :113.00   Max.   :31.00   Max.   :2000  
                                                                  
   dob_month         gender          tenure        friend_count   
 Min.   : 1.000   female:40254   Min.   :   0.0   Min.   :   0.0  
 1st Qu.: 3.000   male  :58574   1st Qu.: 226.0   1st Qu.:  31.0  
 Median : 6.000   NA's  :  175   Median : 412.0   Median :  82.0  
 Mean   : 6.283                  Mean   : 537.9   Mean   : 196.4  
 3rd Qu.: 9.000                  3rd Qu.: 675.0   3rd Qu.: 206.0  
 Max.   :12.000                  Max.   :3139.0   Max.   :4923.0  
                                 NA's   :2                        
 friendships_initiated     likes         likes_received      mobile_likes    
 Min.   :   0.0        Min.   :    0.0   Min.   :     0.0   Min.   :    0.0  
 1st Qu.:  17.0        1st Qu.:    1.0   1st Qu.:     1.0   1st Qu.:    0.0  
 Median :  46.0        Median :   11.0   Median :     8.0   Median :    4.0  
 Mean   : 107.5        Mean   :  156.1   Mean   :   142.7   Mean   :  106.1  
 3rd Qu.: 117.0        3rd Qu.:   81.0   3rd Qu.:    59.0   3rd Qu.:   46.0  
 Max.   :4144.0        Max.   :25111.0   Max.   :261197.0   Max.   :25111.0  
                                                                             
 mobile_likes_received   www_likes        www_likes_received 
 Min.   :     0.00     Min.   :    0.00   Min.   :     0.00  
 1st Qu.:     0.00     1st Qu.:    0.00   1st Qu.:     0.00  
 Median :     4.00     Median :    0.00   Median :     2.00  
 Mean   :    84.12     Mean   :   49.96   Mean   :    58.57  
 3rd Qu.:    33.00     3rd Qu.:    7.00   3rd Qu.:    20.00  
 Max.   :138561.00     Max.   :14865.00   Max.   :129953.00



In [9]:

    
%%R
str(pf)









    





'data.frame':	99003 obs. of  15 variables:
 $ userid               : int  2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
 $ age                  : int  14 14 14 14 14 14 13 13 13 13 ...
 $ dob_day              : int  19 2 16 25 4 1 14 4 1 2 ...
 $ dob_year             : int  1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
 $ dob_month            : int  11 11 11 12 12 12 1 1 1 2 ...
 $ gender               : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
 $ tenure               : int  266 6 13 93 82 15 12 0 81 171 ...
 $ friend_count         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ friendships_initiated: int  0 0 0 0 0 0 0 0 0 0 ...
 $ likes                : int  0 0 0 0 0 0 0 0 0 0 ...
 $ likes_received       : int  0 0 0 0 0 0 0 0 0 0 ...
 $ mobile_likes         : int  0 0 0 0 0 0 0 0 0 0 ...
 $ mobile_likes_received: int  0 0 0 0 0 0 0 0 0 0 ...
 $ www_likes            : int  0 0 0 0 0 0 0 0 0 0 ...
 $ www_likes_received   : int  0 0 0 0 0 0 0 0 0 0 ...



In [4]:

    
pf = pd.read_csv('pseudo_facebook.tsv', sep='\t')
pf.head()









    Out[4]:






  
    
      
      userid
      age
      dob_day
      dob_year
      dob_month
      gender
      tenure
      friend_count
      friendships_initiated
      likes
      likes_received
      mobile_likes
      mobile_likes_received
      www_likes
      www_likes_received
    
  
  
    
      0
      2094382
      14
      19
      1999
      11
      male
      266.0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      1
      1192601
      14
      2
      1999
      11
      female
      6.0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      2
      2083884
      14
      16
      1999
      11
      male
      13.0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      3
      1203168
      14
      25
      1999
      12
      female
      93.0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      4
      1733186
      14
      4
      1999
      12
      male
      82.0
      0
      0
      0
      0
      0
      0
      0
      0



In [18]:

    
pf.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99003 entries, 0 to 99002
Data columns (total 15 columns):
userid                   99003 non-null int64
age                      99003 non-null int64
dob_day                  99003 non-null int64
dob_year                 99003 non-null int64
dob_month                99003 non-null int64
gender                   98828 non-null object
tenure                   99001 non-null float64
friend_count             99003 non-null int64
friendships_initiated    99003 non-null int64
likes                    99003 non-null int64
likes_received           99003 non-null int64
mobile_likes             99003 non-null int64
mobile_likes_received    99003 non-null int64
www_likes                99003 non-null int64
www_likes_received       99003 non-null int64
dtypes: float64(1), int64(13), object(1)
memory usage: 11.3+ MB



In [5]:

    
#Make gender categorical
pf['gender'] = pf['gender'].astype('category')



In [20]:

    
pf.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99003 entries, 0 to 99002
Data columns (total 15 columns):
userid                   99003 non-null int64
age                      99003 non-null int64
dob_day                  99003 non-null int64
dob_year                 99003 non-null int64
dob_month                99003 non-null int64
gender                   98828 non-null category
tenure                   99001 non-null float64
friend_count             99003 non-null int64
friendships_initiated    99003 non-null int64
likes                    99003 non-null int64
likes_received           99003 non-null int64
mobile_likes             99003 non-null int64
mobile_likes_received    99003 non-null int64
www_likes                99003 non-null int64
www_likes_received       99003 non-null int64
dtypes: category(1), float64(1), int64(13)
memory usage: 10.7 MB



In [21]:

    
pf.gender









    Out[21]:





0          male
1        female
2          male
3        female
4          male
5          male
6          male
7        female
8          male
9          male
10         male
11         male
12         male
13         male
14       female
15       female
16       female
17       female
18       female
19         male
20       female
21       female
22         male
23         male
24         male
25         male
26         male
27         male
28         male
29         male
          ...  
98973    female
98974      male
98975      male
98976    female
98977    female
98978    female
98979    female
98980    female
98981    female
98982    female
98983    female
98984    female
98985      male
98986    female
98987    female
98988      male
98989    female
98990      male
98991    female
98992    female
98993      male
98994    female
98995    female
98996    female
98997    female
98998    female
98999    female
99000    female
99001    female
99002    female
Name: gender, dtype: category
Categories (2, object): [female, male]

Try to cast the numerical codes to categories like in R



In [14]:

    
pf['gender'] = pf['gender'].cat.codes



In [15]:

    
pf.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99003 entries, 0 to 99002
Data columns (total 15 columns):
userid                   99003 non-null int64
age                      99003 non-null int64
dob_day                  99003 non-null int64
dob_year                 99003 non-null int64
dob_month                99003 non-null int64
gender                   99003 non-null int8
tenure                   99001 non-null float64
friend_count             99003 non-null int64
friendships_initiated    99003 non-null int64
likes                    99003 non-null int64
likes_received           99003 non-null int64
mobile_likes             99003 non-null int64
mobile_likes_received    99003 non-null int64
www_likes                99003 non-null int64
www_likes_received       99003 non-null int64
dtypes: float64(1), int64(13), int8(1)
memory usage: 10.7 MB



In [16]:

    
pf.gender # it turrns them into numbers but without retainning the string information









    Out[16]:





0        1
1        0
2        1
3        0
4        1
5        1
6        1
7        0
8        1
9        1
10       1
11       1
12       1
13       1
14       0
15       0
16       0
17       0
18       0
19       1
20       0
21       0
22       1
23       1
24       1
25       1
26       1
27       1
28       1
29       1
        ..
98973    0
98974    1
98975    1
98976    0
98977    0
98978    0
98979    0
98980    0
98981    0
98982    0
98983    0
98984    0
98985    1
98986    0
98987    0
98988    1
98989    0
98990    1
98991    0
98992    0
98993    1
98994    0
98995    0
98996    0
98997    0
98998    0
98999    0
99000    0
99001    0
99002    0
Name: gender, dtype: int8



In [22]:

    
pf.gender.cat.categories #There they are!









    Out[22]:





Index(['female', 'male'], dtype='object')



In [24]:

    
pf.gender.cat.codes









    Out[24]:





0        1
1        0
2        1
3        0
4        1
5        1
6        1
7        0
8        1
9        1
10       1
11       1
12       1
13       1
14       0
15       0
16       0
17       0
18       0
19       1
20       0
21       0
22       1
23       1
24       1
25       1
26       1
27       1
28       1
29       1
        ..
98973    0
98974    1
98975    1
98976    0
98977    0
98978    0
98979    0
98980    0
98981    0
98982    0
98983    0
98984    0
98985    1
98986    0
98987    0
98988    1
98989    0
98990    1
98991    0
98992    0
98993    1
98994    0
98995    0
98996    0
98997    0
98998    0
98999    0
99000    0
99001    0
99002    0
dtype: int8



In [25]:

    
# This seems to work
pd.factorize(pf.gender)









    Out[25]:





(array([0, 1, 0, ..., 1, 1, 1]), Index(['male', 'female'], dtype='object'))



In [26]:

    
pf['gender'] = pd.factorize(pf.gender) # This does not work because they have different indexes









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-26-518b14a1923f> in <module>()
----> 1 pf['gender'] = pd.factorize(pf.gender)

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   2417         else:
   2418             # set column
-> 2419             self._set_item(key, value)
   2420 
   2421     def _setitem_slice(self, key, value):

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   2483 
   2484         self._ensure_valid_index(value)
-> 2485         value = self._sanitize_column(key, value)
   2486         NDFrame._set_item(self, key, value)
   2487 

~/anaconda3/lib/python3.6/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value, broadcast)
   2654 
   2655             # turn me into an ndarray
-> 2656             value = _sanitize_index(value, self.index, copy=False)
   2657             if not isinstance(value, (np.ndarray, Index)):
   2658                 if isinstance(value, list) and len(value) > 0:

~/anaconda3/lib/python3.6/site-packages/pandas/core/series.py in _sanitize_index(data, index, copy)
   2798 
   2799     if len(data) != len(index):
-> 2800         raise ValueError('Length of values does not match length of ' 'index')
   2801 
   2802     if isinstance(data, PeriodIndex):

ValueError: Length of values does not match length of index

Great answer here

Make a Scatterplot

In R



In [27]:

    
%%R
qplot(x=age, y=friend_count, data=pf)

Use ggplot syntax to set xlim



In [42]:

    
%%R
#Find the limits
summary(pf$age)









    





   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  13.00   20.00   28.00   37.28   50.00  113.00



In [39]:

    
%%R
ggplot(aes(x=age, y=friend_count), data=pf) + geom_point() +
  xlim(13, 90)

Set Transparency



In [45]:

    
%%R
ggplot(aes(x=age, y=friend_count), data=pf) +
  geom_point(alpha = 1/20) +
  xlim(13, 90)

Add Jitter



In [48]:

    
%%R
ggplot(aes(x=age, y=friend_count), data=pf) +
  geom_jitter(alpha = 1/20) +
  xlim(13, 90)

Coordinate transformation



In [61]:

    
%%R
ggplot(aes(x=age, y=friend_count), data=pf) +
  geom_point(alpha = 1/20) +
  xlim(13, 90) + 
  coord_trans(y = 'sqrt')

Alpha and Jitter Exercise

Examine the relationship between friendships_initiated (y) and age (x) using the ggplot syntax.



In [ ]:

    
%%R
ggplot(aes(x=age, y=friendships_initiated), data=pf) +
  geom_point(alpha = 1/20) +
  xlim(13, 90) + 
  coord_trans(y = 'sqrt')

In Pandas



In [36]:

    
pf.plot.scatter(x='age', y='friend_count', figsize=(8,8));

Three distinct bands of higher fiend counts below 30, at 69 and above 100



In [43]:

    
pf.age.describe()









    Out[43]:





count    99003.000000
mean        37.280224
std         22.589748
min         13.000000
25%         20.000000
50%         28.000000
75%         50.000000
max        113.000000
Name: age, dtype: float64



In [44]:

    
pf.plot.scatter(x='age', y='friend_count', figsize=(8,8))
plt.xlim(13,90);



In [47]:

    
pf.plot.scatter(x='age', y='friend_count',alpha=1/20, figsize=(8,8))
plt.xlim(13,90);

The bulk of friend counts are below 1000



In [72]:

    
ax = sns.swarmplot(x='age', y='friend_count',data=pf, alpha=1/20) # something ain't right here
ax.set_xlim(13,90)


plt.figure(figsize=(8,8)); #It takes a lot of time, does not recognize the xlim and it doesn't finish









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-72-9e233bd86862> in <module>()
      1 
----> 2 ax = sns.swarmplot(x='age', y='friend_count',data=pf, alpha=1/20) # something ain't right here
      3 ax.set_xlim(13,90)
      4 
      5 

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in swarmplot(x, y, hue, data, order, hue_order, split, orient, color, palette, size, edgecolor, linewidth, ax, **kwargs)
   2737                        linewidth=linewidth))
   2738 
-> 2739     plotter.plot(ax, kwargs)
   2740     return ax
   2741 

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in plot(self, ax, kws)
   1395     def plot(self, ax, kws):
   1396         """Make the full plot."""
-> 1397         self.draw_swarmplot(ax, kws)
   1398         self.add_legend_data(ax)
   1399         self.annotate_axes(ax)

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in draw_swarmplot(self, ax, kws)
   1391         for center, swarm in zip(centers, swarms):
   1392             if swarm.get_offsets().size:
-> 1393                 self.swarm_points(ax, swarm, center, width, s, **kws)
   1394 
   1395     def plot(self, ax, kws):

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in swarm_points(self, ax, points, center, width, s, **kws)
   1300 
   1301         # Do the beeswarm in point coordinates
-> 1302         new_xy = self.beeswarm(orig_xy, d)
   1303 
   1304         # Transform the point coordinates back to data coordinates

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in beeswarm(self, orig_xy, d)
   1260             # Remove the positions that overlap with any of the
   1261             # other neighbors
-> 1262             candidates = self.prune_candidates(candidates, neighbors, d)
   1263 
   1264             # Find the most central of the remaining positions

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in prune_candidates(self, candidates, neighbors, d)
   1232             good_candidate = True
   1233             for xy_j in neighbors:
-> 1234                 if self.overlap(xy_i, xy_j, d):
   1235                     good_candidate = False
   1236             if good_candidate:

~/anaconda3/lib/python3.6/site-packages/seaborn/categorical.py in overlap(self, xy_i, xy_j, d)
   1191         """Return True if two circles with the same diameter will overlap."""
   1192         x_i, y_i = xy_i
-> 1193         x_j, y_j = xy_j
   1194         return ((x_i - x_j) ** 2 + (y_i - y_j) ** 2) < (d ** 2)
   1195 

KeyboardInterrupt:

A few useful thoughts about Jitter here: https://github.com/matplotlib/matplotlib/issues/2750



In [70]:

    
pf['sq_friend_count']  = np.sqrt(pf.friend_count)
ax = pf.plot.scatter(x='age', y='sq_friend_count', alpha=1/20, figsize=(8,8))
plt.xlim(13,90); # The Y 

# There does not seem to exist a equivalennt here although the result is quite interesting
# You probably have to do it by hand









    Out[70]:





(7.994489975861895,
 118.00551002413813,
 -0.36203723144491262,
 89.863425977652682)

Conditional Means



In [31]:

    
%%R

age_groups <- group_by(pf, age)

pf.fc_by_age <- summarise(age_groups,
                          friend_count_mean = mean(friend_count),
                          friend_count_median = median(friend_count),
                          n = n())

pf.fc_by_age <- arrange(pf.fc_by_age, age)

head(pf.fc_by_age)









    





# A tibble: 6 × 4
    age friend_count_mean friend_count_median     n
  <int>             <dbl>               <dbl> <int>
1    13          164.7500                74.0   484
2    14          251.3901               132.0  1925
3    15          347.6921               161.0  2618
4    16          351.9371               171.5  3086
5    17          350.3006               156.0  3283
6    18          331.1663               162.0  5196



In [98]:

    
%%R
## Alternative
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(friend_count_mean = mean(friend_count),
            friend_count_median = median(friend_count),
            n = n()) %>%
  arrange(age)

head(pf.fc_by_age)









    





# A tibble: 6 × 4
    age friend_count_mean friend_count_median     n
  <int>             <dbl>               <dbl> <int>
1    13          164.7500                74.0   484
2    14          251.3901               132.0  1925
3    15          347.6921               161.0  2618
4    16          351.9371               171.5  3086
5    17          350.3006               156.0  3283
6    18          331.1663               162.0  5196



In [52]:

    
grouped = pf.groupby('age')
pf_c_by_age = grouped['friend_count'].agg([np.mean, np.median, len])

# pf_c_by_age = pd.DataFrame([pf_c_by_age.index, pf_c_by_age.mean, pf_c_by_age.median, pf_c_by_age.len],
#                            columns=['age','friend_count_mean', 'friend_count_median', 'n'])
                    
pf_c_by_age = pd.DataFrame({'age': pf_c_by_age.index,
                            'friend_count_mean': pf_c_by_age['mean'],
                            'friend_count_median': pf_c_by_age['median'],
                            'n': pf_c_by_age['len']}).reset_index(drop=True) # this one will ake the index and make it a column if drop=False

pf_c_by_age.sort_values('age')
pf_c_by_age.head()









    Out[52]:






  
    
      
      age
      friend_count_mean
      friend_count_median
      n
    
  
  
    
      0
      13
      164.750000
      74.0
      484
    
    
      1
      14
      251.390130
      132.0
      1925
    
    
      2
      15
      347.692131
      161.0
      2618
    
    
      3
      16
      351.937135
      171.5
      3086
    
    
      4
      17
      350.300640
      156.0
      3283

Plot the conditional means



In [99]:

    
%%R
ggplot(aes(x=age, y=friend_count_mean), data=pf.fc_by_age) +
  geom_point(alpha = 1/1) +
  xlim(13, 90)



In [103]:

    
pf_c_by_age.plot.scatter(x='age', y ='friend_count_mean');



In [100]:

    
%%R
# With lines
ggplot(aes(x=age, y=friend_count_mean), data=pf.fc_by_age) +
  geom_line(alpha = 1/1) +
  xlim(13, 90)



In [104]:

    
pf_c_by_age.plot.line(x='age', y ='friend_count_mean');

Overlaying Summaries with Raw Data



In [105]:

    
%%R
ggplot(aes(x = age, y=friend_count), data=pf) + 
  xlim(13,90) + 
  geom_point(alpha=0.05,
             position=position_jitter(h=0),
             color='orange') + 
  coord_trans(y='sqrt') + 
  geom_line(stat='summary', fun.y=mean) +
  geom_line(stat='summary', fun.y=quantile, fun.args = list(probs = .1),
            linetype=2, color='blue') +
  geom_line(stat='summary', fun.y=quantile, fun.args = list(probs = .5),
             color='blue') +
   geom_line(stat='summary', fun.y=quantile, fun.args = list(probs = .9),
            linetype=2, color='blue')



In [107]:

    
# Try to get the sqrt scale
import matplotlib.scale as mscale
import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import matplotlib.ticker as ticker
import numpy as np

class SquareRootScale(mscale.ScaleBase):
    """
    ScaleBase class for generating square root scale.
    """

    name = 'squareroot'

    def __init__(self, axis, **kwargs):
        mscale.ScaleBase.__init__(self)

    def set_default_locators_and_formatters(self, axis):
        axis.set_major_locator(ticker.AutoLocator())
        axis.set_major_formatter(ticker.ScalarFormatter())
        axis.set_minor_locator(ticker.NullLocator())
        axis.set_minor_formatter(ticker.NullFormatter())

    def limit_range_for_scale(self, vmin, vmax, minpos):
        return  max(0., vmin), vmax

    class SquareRootTransform(mtransforms.Transform):
        input_dims = 1
        output_dims = 1
        is_separable = True

        def transform_non_affine(self, a): 
            return np.array(a)**0.5

        def inverted(self):
            return SquareRootScale.InvertedSquareRootTransform()

    class InvertedSquareRootTransform(mtransforms.Transform):
        input_dims = 1
        output_dims = 1
        is_separable = True

        def transform(self, a):
            return np.array(a)**2

        def inverted(self):
            return SquareRootScale.SquareRootTransform()

    def get_transform(self):
        return self.SquareRootTransform()

mscale.register_scale(SquareRootScale)



In [109]:

    
fig, ax = plt.subplots(1)

ax.set_yscale('squareroot')

pf.plot.scatter(x='age', y='friend_count', alpha=1/20, ax=ax)
plt.xlim(13,90)
# Doesn't work









    Out[109]:





(13, 90)



In [161]:

    
fig, ax = plt.subplots(1)

ax.set_yscale('symlog')

pf.plot.scatter(x='age', y='friend_count', alpha=1/20,color='orange', ax=ax)

pf_c_by_age.plot.line(x='age', y='friend_count_mean',color='blue', ax=ax)

#Ploting the quantile on the fly
pf.groupby('age').friend_count.quantile(0.1).plot.line(color='blue',style='--', ax=ax)
pf.groupby('age').friend_count.quantile(0.5).plot.line(color='blue',style='-', ax=ax)
pf.groupby('age').friend_count.quantile(0.9).plot.line(color='blue',style='--', ax=ax)

plt.xlim(13,90);
plt.ylim(ymin=1);



In [162]:

    
# Without the axis scaling
fig, ax = plt.subplots(1)

# ax.set_yscale('symlog')

pf.plot.scatter(x='age', y='friend_count', alpha=1/20,color='orange', ax=ax)

pf_c_by_age.plot.line(x='age', y='friend_count_mean',color='blue', ax=ax)

#Ploting the quantile on the fly
pf.groupby('age').friend_count.quantile(0.1).plot.line(color='blue',style='--', ax=ax)
pf.groupby('age').friend_count.quantile(0.5).plot.line(color='blue',style='-', ax=ax)
pf.groupby('age').friend_count.quantile(0.9).plot.line(color='blue',style='--', ax=ax)

plt.xlim(13,90);
# plt.ylim(ymin=1);



In [166]:

    
# zooming in
fig, ax = plt.subplots(1)

# ax.set_yscale('symlog')

pf.plot.scatter(x='age', y='friend_count', alpha=1/10,color='orange', ax=ax)

pf_c_by_age.plot.line(x='age', y='friend_count_mean',color='black', ax=ax)

#Ploting the quantile on the fly
pf.groupby('age').friend_count.quantile(0.1).plot.line(color='blue',style='--', ax=ax)
pf.groupby('age').friend_count.quantile(0.5).plot.line(color='blue',style='-', ax=ax)
pf.groupby('age').friend_count.quantile(0.9).plot.line(color='red',style='--', ax=ax)

plt.xlim(13,90)
plt.ylim(0,1000);

I am still missing the upper part of the y-range since I cannot scale the axis in python in sqrt the same way

Correlation



In [168]:

    
%%R
library(stats)
cor.test(pf$age, pf$friend_count)









    





	Pearson's product-moment correlation

data:  pf$age and pf$friend_count
t = -8.6268, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.03363072 -0.02118189
sample estimates:
        cor 
-0.02740737



In [169]:

    
%%R
# Alternative
with(pf, cor.test(age, friend_count))









    





	Pearson's product-moment correlation

data:  age and friend_count
t = -8.6268, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.03363072 -0.02118189
sample estimates:
        cor 
-0.02740737



In [172]:

    
pf[['age','friend_count']].corr()









    Out[172]:






  
    
      
      age
      friend_count
    
  
  
    
      age
      1.000000
      -0.027407
    
    
      friend_count
      -0.027407
      1.000000



In [173]:

    
pf.age.corr(pf.friend_count)









    Out[173]:





-0.02740737154230408



In [174]:

    
np.corrcoef(pf.age, pf.friend_count)









    Out[174]:





array([[ 1.        , -0.02740737],
       [-0.02740737,  1.        ]])



In [175]:

    
np.correlate(pf.age, pf.friend_count) ##??









    Out[175]:





array([700962402])

Correlation on Subsets



In [176]:

    
%%R
with(subset(pf, age<=70), cor.test(age, friend_count))









    





	Pearson's product-moment correlation

data:  age and friend_count
t = -52.592, df = 91029, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.1780220 -0.1654129
sample estimates:
       cor 
-0.1717245



In [178]:

    
pf.age[pf.age<=70].corr(pf.friend_count)









    Out[178]:





-0.17172448270245838

Correlation Methods



In [179]:

    
%%R
with(subset(pf, age<=70), cor.test(age, friend_count, method = 'spearman'))









    





	Spearman's rank correlation rho

data:  age and friend_count
S = 1.5782e+14, p-value < 2.2e-16
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
-0.2552934



In [180]:

    
pf.age[pf.age<=70].corr(pf.friend_count, method='spearman')









    Out[180]:





-0.25529339549271318

Scatterplot Exercise



In [182]:

    
%%R
ggplot(aes(x = www_likes_received, y=likes_received), data=pf) +  
  geom_point()



In [185]:

    
pf.plot.scatter(x = 'www_likes_received', y='likes_received');

Strong Correlations



In [207]:

    
%%R
ggplot(aes(x=www_likes_received, y=likes_received), data=pf) + 
  geom_point() + 
  xlim(0, quantile(pf$www_likes_received, 0.95)) + 
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = 'lm', color = 'red')



In [216]:

    
fig, ax = plt.subplots(1)
pf.plot.scatter(x = 'www_likes_received', y='likes_received', ax=ax)
# pf.plot.line(x = 'www_likes_received', y='likes_received', ax=ax, color='red') # Doesn't work
plt.xlim(0, pf.www_likes_received.quantile(.95))
plt.ylim(0, pf.likes_received.quantile(.95));

Use sklearn to fit the linear regression



In [240]:

    
# Have to convert the training X to 2d dimesional array first
X = pf.www_likes_received.values.reshape(len(pf.www_likes_received), 1)
X









    Out[240]:





array([[   0],
       [   0],
       [   0],
       ..., 
       [1092],
       [ 756],
       [2913]])



In [245]:

    
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X, pf.likes_received)

fig, ax = plt.subplots(1)
pf.plot.scatter(x = 'www_likes_received', y='likes_received', ax=ax)
plt.plot(X, reg.predict(X), color='red')
# pf.plot.line(x = 'www_likes_received', y='likes_received', ax=ax, color='red') # Doesn't work
plt.xlim(0, pf.www_likes_received.quantile(.95))
plt.ylim(0, pf.likes_received.quantile(.95));

Seaborn is the faster solution in this case



In [206]:

    
ax = sns.regplot(x = 'www_likes_received', y='likes_received', data=pf, line_kws = {'color': 'red'},
                robust=True, ci=None)
sns.plt.xlim(0, pf.www_likes_received.quantile(.95))
sns.plt.ylim(0, pf.likes_received.quantile(.95));

Trying out ggplot for Python



In [191]:

    
from ggplot import *



In [227]:

    
ggplot(aes(x='www_likes_received', y='likes_received'), data=pf) + \
geom_point() + \
xlim(0, pf.www_likes_received.quantile(.95)) + \
ylim(0, pf.likes_received.quantile(.95)) + \
stat_smooth(color = 'red') ## Yeay!!









    



/home/jkb/anaconda3/lib/python3.6/site-packages/ggplot/stats/stat_smooth.py:77: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  smoothed_data = smoothed_data.sort('x')






    












    Out[227]:





<ggplot: (8776123141005)>

Python-ggplot example:



In [198]:

    
meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']], id_vars='date')
ggplot(aes(x='date', y='value', colour='variable'), data=meat_lng) + \
    geom_point() + \
    stat_smooth(color='red')









    



/home/jkb/anaconda3/lib/python3.6/site-packages/ggplot/stats/stat_smooth.py:77: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
  smoothed_data = smoothed_data.sort('x')






    












    Out[198]:





<ggplot: (-9223363260731274019)>

Correlation Calculation

What's the correlation between the two variables? Include the top 5% of values for the variable in the calculation and round to 3 decimal places.



In [246]:

    
%%R
with(pf, cor.test(www_likes_received, likes_received, method = 'pearson'))









    





	Pearson's product-moment correlation

data:  www_likes_received and likes_received
t = 937.1, df = 99001, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9473553 0.9486176
sample estimates:
      cor 
0.9479902

Trying Scipy.stats



In [259]:

    
import scipy
c = scipy.stats.pearsonr(pf.www_likes_received, pf.likes_received)[0]
round(c,3)









    Out[259]:





0.94799999999999995

More caution with correlations

Plot a scatter plot from the Mitchell Dataset



In [266]:

    
%ls









    



correlation_images.jpeg  lesson4_student.rmd  pseudo_facebook.tsv
Lesson 4.ipynb           Mitchell.csv



In [9]:

    
%%R
Mitchell = read.csv('Mitchell.csv')
head(Mitchell)









    





  Month     Temp
1     0 -5.18333
2     1 -1.65000
3     2  2.49444
4     3 10.40000
5     4 14.99440
6     5 21.71670



In [10]:

    
mitchell = r.Mitchell



In [271]:

    
%%R
ggplot(aes(x=Month, y=Temp), data=Mitchell) + 
  geom_point()



In [272]:

    
#Let's use the R object just for fun
r.Mitchell.head()



In [275]:

    
r.Mitchell.plot.scatter(x='Month', y='Temp');

Noisy Scatterplots - Coefficiennts

a. Take a guess for the correlation coefficient for the scatterplot. 0.5

b. What is the actual correlation of the two variables? (Round to the thousandths place) 0.057



In [276]:

    
%%R
with(Mitchell, cor(Month, Temp))
with(Mitchell, cor.test(Month, Temp))









    





	Pearson's product-moment correlation

data:  Month and Temp
t = 0.81816, df = 202, p-value = 0.4142
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.08053637  0.19331562
sample estimates:
       cor 
0.05747063



In [278]:

    
scipy.stats.pearsonr(mitchell.Month, mitchell.Temp)









    Out[278]:





(0.057470629450414333, 0.41422797458167238)

Making sense of data - xscale



In [283]:

    
%%R
range(Mitchell$Month)









    





[1]   0 203



In [281]:

    
%%R
ggplot(aes(x=Month, y=Temp), data=Mitchell) + 
  geom_point() + 
  scale_x_continuous(breaks = seq(0, 203, 12))



In [289]:

    
r.Mitchell.Month.describe().loc[['min', 'max']]









    Out[289]:





min      0.0
max    203.0
Name: Month, dtype: float64



In [291]:

    
# Alternaive
r.Mitchell.Month.ptp() # Nope, returns the difference of values









    Out[291]:





203



In [280]:

    
r.Mitchell.plot.scatter(x='Month', y='Temp')
plt.xticks(range(0,203,12));

A New Perspective



In [292]:

    
%%R
#Detect the yearly flunctuation in the data
ggplot(aes(x=(Month%%12),y=Temp), data=Mitchell)+
  geom_point()



In [300]:

    
r.Mitchell.plot.scatter(x='Month', y='Temp', figsize=(24,8))
plt.xticks(range(0,203,12));

Data Visualization Pioneers

John Tukey

William Playfair

William Playfair and the Psychology of Graphs

There are other measures of associations that can detect this. The dcor.ttest() function in the energy package implements a non-parametric test of the independence of two variables. While the Mitchell soil dataset is too coarse to identify a significant dependency between "Month" and "Temp", we can see the difference between dcor.ttest and cor.test through other examples, like the following:

x <- seq(0, 4*pi, pi/20)
y <- cos(x)
qplot(x = x, y = y)
dcor.ttest(x, y)

There is a yearly cyclical pattern here that is obvious if we stretch the plot.

We can also overrlay each yearr and see the yearly pattern like follows:



In [304]:

    
mitchell['monthperyear'] = mitchell.Month%12
mitchell.plot.scatter(x='monthperyear', y='Temp');

Understanding Noise: Age to Age Months

Add a month decimal to the years



In [12]:

    
%%R

pf$age_with_months <- pf$age + (12 - pf$dob_month) / 12
head(pf)









    





   userid age dob_day dob_year dob_month gender tenure friend_count
1 2094382  14      19     1999        11   male    266            0
2 1192601  14       2     1999        11 female      6            0
3 2083884  14      16     1999        11   male     13            0
4 1203168  14      25     1999        12 female     93            0
5 1733186  14       4     1999        12   male     82            0
6 1524765  14       1     1999        12   male     15            0
  friendships_initiated likes likes_received mobile_likes mobile_likes_received
1                     0     0              0            0                     0
2                     0     0              0            0                     0
3                     0     0              0            0                     0
4                     0     0              0            0                     0
5                     0     0              0            0                     0
6                     0     0              0            0                     0
  www_likes www_likes_received age_with_months
1         0                  0        14.08333
2         0                  0        14.08333
3         0                  0        14.08333
4         0                  0        14.00000
5         0                  0        14.00000
6         0                  0        14.00000



In [13]:

    
pf['age_with_months'] = pf.age + (12 - pf.dob_month) / 12
pf.head()









    Out[13]:






  
    
      
      userid
      age
      dob_day
      dob_year
      dob_month
      gender
      tenure
      friend_count
      friendships_initiated
      likes
      likes_received
      mobile_likes
      mobile_likes_received
      www_likes
      www_likes_received
      age_with_months
    
  
  
    
      0
      2094382
      14
      19
      1999
      11
      male
      266.0
      0
      0
      0
      0
      0
      0
      0
      0
      14.083333
    
    
      1
      1192601
      14
      2
      1999
      11
      female
      6.0
      0
      0
      0
      0
      0
      0
      0
      0
      14.083333
    
    
      2
      2083884
      14
      16
      1999
      11
      male
      13.0
      0
      0
      0
      0
      0
      0
      0
      0
      14.083333
    
    
      3
      1203168
      14
      25
      1999
      12
      female
      93.0
      0
      0
      0
      0
      0
      0
      0
      0
      14.000000
    
    
      4
      1733186
      14
      4
      1999
      12
      male
      82.0
      0
      0
      0
      0
      0
      0
      0
      0
      14.000000

Age with Months Means



In [ ]:

    
# Create a new data frame called
# pf.fc_by_age_months that contains
# the mean friend count, the median friend
# count, and the number of users in each
# group of age_with_months. The rows of the
# data framed should be arranged in increasing
# order by the age_with_months variable.

# For example, the first two rows of the resulting
# data frame would look something like...

# age_with_months  friend_count_mean	friend_count_median	n
#              13            275.0000                   275 2
#        13.25000            133.2000                   101 11


# See the Instructor Notes for two hints if you get stuck.
# This programming assignment will automatically be graded.



In [15]:

    
%%R

fc_by_age_months_groups <- group_by(pf, age_with_months)

pf.fc_by_age_months <- summarise(fc_by_age_months_groups ,
                          friend_count_mean = mean(friend_count),
                          friend_count_median = median(friend_count),
                          n = n())

pf.fc_by_age_monthse <- arrange(pf.fc_by_age_months)

head(pf.fc_by_age_months)









    





# A tibble: 6 × 4
  age_with_months friend_count_mean friend_count_median     n
            <dbl>             <dbl>               <dbl> <int>
1        13.16667          46.33333                30.5     6
2        13.25000         115.07143                23.5    14
3        13.33333         136.20000                44.0    25
4        13.41667         164.24242                72.0    33
5        13.50000         131.17778                66.0    45
6        13.58333         156.81481                64.0    54



In [17]:

    
fc_by_age_months_grouped = pf.groupby('age_with_months')

fc_by_age_months = fc_by_age_months_grouped.friend_count.aggregate([np.mean, np.median, len])
fc_by_age_months.head()









    Out[17]:






  
    
      
      mean
      median
      len
    
    
      age_with_months
      
      
      
    
  
  
    
      13.166667
      46.333333
      30.5
      6
    
    
      13.250000
      115.071429
      23.5
      14
    
    
      13.333333
      136.200000
      44.0
      25
    
    
      13.416667
      164.242424
      72.0
      33
    
    
      13.500000
      131.177778
      66.0
      45



In [25]:

    
#alternative
fc_by_age_months_grouped = pf.groupby('age_with_months')

fc_by_age_months = fc_by_age_months_grouped.friend_count.aggregate([np.mean, np.median, len])

fc_by_age_months = pd.DataFrame({'age_with_months':fc_by_age_months.index,
                                'friend_count_mean': fc_by_age_months['mean'],
                                'friend_count_median': fc_by_age_months['median'],
                                'n': fc_by_age_months['len']}).reset_index(drop=True)

fc_by_age_months.head()









    Out[25]:






  
    
      
      age_with_months
      friend_count_mean
      friend_count_median
      n
    
  
  
    
      0
      13.166667
      46.333333
      30.5
      6
    
    
      1
      13.250000
      115.071429
      23.5
      14
    
    
      2
      13.333333
      136.200000
      44.0
      25
    
    
      3
      13.416667
      164.242424
      72.0
      33
    
    
      4
      13.500000
      131.177778
      66.0
      45

Noise in Conditional Means



In [26]:

    
%%R
# Create a new line plot showing friend_count_mean versus the new variable,
# age_with_months. Be sure to use the correct data frame (the one you created
# in the last exercise) AND subset the data to investigate users with ages less
# than 71.

ggplot(aes(x = age_with_months, y = friend_count_mean), data = subset(pf.fc_by_age_months, age_with_months<71)) +
  geom_line()



In [29]:

    
fc_by_age_months[fc_by_age_months.age_with_months<71].plot(x = 'age_with_months', y = 'friend_count_mean');

Smoothing Conditional Means



In [32]:

    
%%R
p1 <- ggplot(aes(x = age, y = friend_count_mean),
             data = subset(pf.fc_by_age, age<71)) + 
  geom_line() + 
  geom_smooth()

p2 <- ggplot(aes(x = age_with_months, y = friend_count_mean),
             data = subset(pf.fc_by_age_months, age_with_months<71)) + 
  geom_line() + 
  geom_smooth()

p3 <- ggplot(aes(x = round(age/5)*5, y = friend_count),
             data = subset(pf, age<71)) +
  geom_line(stat = 'summary', fun.y = mean)

library(gridExtra)
grid.arrange(p2,p1,p3, ncol = 1)









    



/home/jkb/anaconda3/lib/python3.6/site-packages/rpy2/rinterface/__init__.py:186: RRuntimeWarning: 
Attaching package: ‘gridExtra’


  warnings.warn(x, RRuntimeWarning)
/home/jkb/anaconda3/lib/python3.6/site-packages/rpy2/rinterface/__init__.py:186: RRuntimeWarning: The following object is masked from ‘package:dplyr’:

    combine


  warnings.warn(x, RRuntimeWarning)
/home/jkb/anaconda3/lib/python3.6/site-packages/rpy2/rinterface/__init__.py:186: RRuntimeWarning: `geom_smooth()` using method = 'loess'

  warnings.warn(x, RRuntimeWarning)



In [59]:

    
fig, (ax1,ax2,ax3) = plt.subplots(3,1)

#This is a small hack because i will combine seaborn's regrression line with a plot from plt
fc_by_age_months[fc_by_age_months.age_with_months<71].plot(x = 'age_with_months', y = 'friend_count_mean', ax=ax1);
sns.regplot(x = 'age_with_months', y = 'friend_count_mean', ax=ax1,
           data = fc_by_age_months[fc_by_age_months.age_with_months<71], scatter = False, order=2, color='black')

pf_c_by_age[pf_c_by_age.age<71].plot(x = 'age', y = 'friend_count_mean', ax=ax2);
sns.regplot(x = 'age', y = 'friend_count_mean', ax=ax2,
           data = pf_c_by_age[pf_c_by_age.age<71], scatter = False, order=2, color='black')

pf = pf[pf.age<71]
pf['round_age'] = np.round(pf.age/5)*5
y = pf.groupby('round_age').mean().friend_count
x = pf.groupby('round_age').mean().index
ax3.plot(x,y);

Thing are soooo more complicated if ones tries to do the same thing in python .But doable.



In [ ]:

	userid	age	dob_day	dob_year	dob_month	gender	tenure
0	2094382	14	19	1999	11	male	266.0
1	1192601	14	2	1999	11	female	6.0
2	2083884	14	16	1999	11	male	13.0
3	1203168	14	25	1999	12	female	93.0
4	1733186	14	4	1999	12	male	82.0

	age	friend_count_mean	friend_count_median	n
0	13	164.750000	74.0	484
1	14	251.390130	132.0	1925
2	15	347.692131	161.0	2618
3	16	351.937135	171.5	3086
4	17	350.300640	156.0	3283

	mean	median	len
age_with_months
13.166667	46.333333	30.5	6
13.250000	115.071429	23.5	14
13.333333	136.200000	44.0	25
13.416667	164.242424	72.0	33
13.500000	131.177778	66.0	45