In [8]:
# Imports and notebook-wide configuration
import rpy2.interactive
import rpy2.interactive.packages
%load_ext rpy2.ipython
# Convert objects directly between pandas and R, in both directions
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Set the global default figure size (width, height)
plt.rcParams['figure.figsize'] = (8.0, 8.0)
In [7]:
%%R
# Load the R libraries used by the R cells below (plotting and data wrangling)
library(ggplot2)
library(dplyr)
In [2]:
# List the contents of the current working directory
%ls
In [ ]:
#let's move the correct data file over from lesson 3 and use some bash knowledge
!cp Data-Science/dand/exploratory_data_analysis/lesson3/pseudo_facebook.tsv
Data-Science/dand/exploratory_data_analysis/lesson4/
# I dit it in bash
In [3]:
# List the directory again to confirm the data file is there
%ls #yep
In [3]:
%%R
# Read the tab-separated data set into the R data frame `pf` and preview it.
# read.delim() is read.csv() with a tab as the default separator.
pf <- read.delim('pseudo_facebook.tsv')
head(pf)
In [6]:
# Run a single R expression from Python: the column names of the R data frame `pf`
%R names(pf)
Out[6]:
In [8]:
%%R
# Per-column summary statistics of the R data frame
summary(pf)
In [9]:
%%R
# Compact overview of the structure (column types and first values)
str(pf)
In [4]:
# Load the same tab-separated file into a pandas DataFrame.
# Note: this Python `pf` is independent of the R data frame `pf` used in the %%R cells.
pf = pd.read_csv('pseudo_facebook.tsv', sep='\t')
pf.head()
Out[4]:
In [18]:
# Column overview (dtypes and non-null counts) of the freshly loaded frame
pf.info()
In [5]:
# Convert the gender column to the pandas 'category' dtype
pf['gender'] = pf.gender.astype('category')
In [20]:
# Check the dtypes again after the conversion of `gender`
pf.info()
In [21]:
# Display the gender column to inspect its dtype and values
pf.gender
Out[21]:
In [14]:
# Store the integer codes in a separate column instead of overwriting `gender`:
# replacing the categorical column would discard its labels and break the
# `.cat` accessor that the following cells rely on when run top to bottom.
pf['gender_code'] = pf['gender'].cat.codes
In [15]:
# Inspect the dtypes after working with the category codes
pf.info()
In [16]:
pf.gender # note: .cat.codes yields bare integer codes; the label strings are not carried along with them
Out[16]:
In [22]:
pf.gender.cat.categories # the label strings are kept here, on the categorical column
Out[22]:
In [24]:
# Integer code of every row's category (requires `gender` to be categorical)
pf.gender.cat.codes
Out[24]:
In [25]:
# This seems to work: pd.factorize returns a pair (integer codes, unique values)
pd.factorize(pf.gender)
Out[25]:
In [26]:
# pd.factorize returns a (codes, uniques) tuple, so assigning its whole result
# to a column fails. Only the first element -- one integer code per row -- is a
# valid column value.
pf['gender'] = pd.factorize(pf.gender)[0]
In [27]:
%%R
# Scatter plot of friend count against age. Written with ggplot() + geom_point()
# rather than qplot(): qplot() is deprecated in current ggplot2 releases, and
# every other plot in this notebook already uses the ggplot() form.
ggplot(aes(x = age, y = friend_count), data = pf) +
  geom_point()
In [42]:
%%R
# Find sensible axis limits from the distribution of age
summary(pf$age)
In [39]:
%%R
# Same scatter plot with the x axis limited to ages 13-90.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  xlim(13, 90)
In [45]:
%%R
# Make the points transparent (20 overlapping points add up to one opaque
# point) to reduce overplotting.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  xlim(13, 90)
In [48]:
%%R
# Jittered version of the transparent scatter plot; geom_jitter() is shorthand
# for geom_point() with position = 'jitter'.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(position = 'jitter', alpha = 0.05) +
  xlim(13, 90)
In [61]:
%%R
# Transparent scatter plot drawn on a square-root transformed y axis.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
In [ ]:
%%R
# Same plot for friendships_initiated instead of friend_count.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 0.05) +
  xlim(13, 90) +
  coord_trans(y = 'sqrt')
In [36]:
# pandas counterpart of the first R scatter plot
pf.plot(kind='scatter', x='age', y='friend_count', figsize=(8, 8));
Three distinct bands of higher friend counts: below 30, at 69 and above 100
In [43]:
# Summary statistics of age (pandas counterpart of R's summary(pf$age))
pf.age.describe()
Out[43]:
In [44]:
# Scatter plot limited to ages 13-90; the limits are set on the Axes that
# pandas returns instead of going through the pyplot state machine.
pf.plot(kind='scatter', x='age', y='friend_count', figsize=(8, 8)).set_xlim(13, 90);
In [47]:
# Transparent points (alpha = 1/20) to reduce overplotting, ages 13-90 only
pf.plot(kind='scatter', x='age', y='friend_count',
        alpha=1/20, figsize=(8, 8)).set_xlim(13, 90);
The bulk of friend counts are below 1000
In [72]:
# Python counterpart of the jittered R plot.
# sns.swarmplot is the wrong tool here: it treats x as categorical (so numeric
# x-limits do not apply) and places every point so that none overlap, which did
# not finish on this data set. Instead add a little uniform horizontal noise to
# the ages and draw an ordinary transparent scatter plot.
fig, ax = plt.subplots(figsize=(8, 8))  # create the figure *before* drawing on it
rng = np.random.RandomState(0)  # fixed seed so the jitter is reproducible
jittered_age = pf.age + rng.uniform(-0.4, 0.4, size=len(pf))
ax.scatter(jittered_age, pf.friend_count, alpha=1/20)
ax.set_xlim(13, 90)
ax.set_xlabel('age')
ax.set_ylabel('friend_count');
A few useful thoughts about Jitter here: https://github.com/matplotlib/matplotlib/issues/2750
In [70]:
# Emulate R's coord_trans(y = 'sqrt') by plotting a precomputed square-root column
pf['sq_friend_count'] = np.sqrt(pf.friend_count)
ax = pf.plot.scatter(x='age', y='sq_friend_count', alpha=1/20, figsize=(8,8))
plt.xlim(13,90); # The y axis is labelled in sqrt units here, not in raw friend counts
# There does not seem to be a direct equivalent of coord_trans here, although the result is quite interesting
# You probably have to do it by hand
Out[70]:
In [31]:
%%R
# Per-age summary of friend_count, built step by step (no pipes):
# group -> summarise -> sort.
age_groups <- dplyr::group_by(pf, age)
pf.fc_by_age <- dplyr::summarise(
  age_groups,
  friend_count_mean   = mean(friend_count),
  friend_count_median = median(friend_count),
  n                   = dplyr::n()
)
pf.fc_by_age <- dplyr::arrange(pf.fc_by_age, age)
head(pf.fc_by_age)
In [98]:
%%R
# Alternative: the same per-age summary written as a single %>% pipeline.
pf.fc_by_age <-
  pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean   = mean(friend_count),
    friend_count_median = median(friend_count),
    n                   = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age)
In [52]:
# pandas counterpart of the dplyr summary: mean, median and number of rows of
# friend_count for every distinct age, as a flat frame with columns
# age, friend_count_mean, friend_count_median, n.
grouped = pf.groupby('age')
pf_c_by_age = (
    grouped['friend_count']
    # String aggregator names behave the same on old and new pandas; passing
    # NumPy callables (np.mean, np.median) raises a FutureWarning on newer versions.
    .agg(['mean', 'median', 'size'])
    .rename(columns={'mean': 'friend_count_mean',
                     'median': 'friend_count_median',
                     'size': 'n'})
    .reset_index()           # turn the 'age' group key back into a regular column
    .sort_values('age')      # sort_values returns a new frame, so keep the result
    .reset_index(drop=True)  # clean 0..n-1 row index after sorting
)
pf_c_by_age.head()
Out[52]:
In [99]:
%%R
# Mean friend count per age, drawn as fully opaque points.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_point(alpha = 1) +
  xlim(13, 90)
In [103]:
# pandas counterpart: mean friend count per age as points
pf_c_by_age.plot(kind='scatter', x='age', y='friend_count_mean');
In [100]:
%%R
# Mean friend count per age, connected with a line instead of points.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line(alpha = 1) +
  xlim(13, 90)
In [104]:
# pandas counterpart: mean friend count per age as a line
pf_c_by_age.plot(kind='line', x='age', y='friend_count_mean');
In [105]:
%%R
# Raw data (orange, jittered horizontally only) on a sqrt-transformed y axis,
# overlaid with summary lines computed per x value: the mean, plus the
# 10th / 50th / 90th percentiles in blue (10th and 90th dashed).
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  xlim(13, 90) +
  geom_point(alpha = 0.05,
             position = position_jitter(h = 0),
             color = 'orange') +
  coord_trans(y = 'sqrt') +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .1),
            linetype = 2, color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .5),
            color = 'blue') +
  geom_line(stat = 'summary', fun.y = quantile,
            fun.args = list(probs = .9),
            linetype = 2, color = 'blue')
In [107]:
# Try to get the sqrt scale
import matplotlib.scale as mscale
import matplotlib.pyplot as plt
import matplotlib.transforms as mtransforms
import matplotlib.ticker as ticker
import numpy as np
class SquareRootScale(mscale.ScaleBase):
    """Axis scale that spaces values by their square root.

    Registered below under the name ``'squareroot'`` so that it can be selected
    with ``ax.set_yscale('squareroot')`` or ``ax.set_xscale('squareroot')``.
    """

    name = 'squareroot'

    def __init__(self, axis, **kwargs):
        """Create the scale.

        ``axis`` and ``kwargs`` are accepted because matplotlib passes them when
        it instantiates a scale; this scale does not use either of them itself.
        """
        try:
            # Newer matplotlib: ScaleBase.__init__ requires the axis argument.
            super().__init__(axis)
        except TypeError:
            # Older matplotlib: ScaleBase defines no __init__ of its own.
            super().__init__()

    def set_default_locators_and_formatters(self, axis):
        """Use automatic major ticks with plain numeric labels and no minor ticks."""
        axis.set_major_locator(ticker.AutoLocator())
        axis.set_major_formatter(ticker.ScalarFormatter())
        axis.set_minor_locator(ticker.NullLocator())
        axis.set_minor_formatter(ticker.NullFormatter())

    def limit_range_for_scale(self, vmin, vmax, minpos):
        """Clamp the lower limit at 0, since the square root is not real below it."""
        return max(0., vmin), vmax

    class SquareRootTransform(mtransforms.Transform):
        """Forward transform: data value -> sqrt(value)."""

        input_dims = 1
        output_dims = 1
        is_separable = True
        has_inverse = True

        def transform_non_affine(self, a):
            return np.array(a)**0.5

        def inverted(self):
            return SquareRootScale.InvertedSquareRootTransform()

    class InvertedSquareRootTransform(mtransforms.Transform):
        """Inverse transform: scaled value -> value squared."""

        input_dims = 1
        output_dims = 1
        is_separable = True
        has_inverse = True

        # Implement transform_non_affine (not transform), exactly like the
        # forward transform: matplotlib calls transform_non_affine directly in
        # places, and the base-class version would return the input unchanged.
        def transform_non_affine(self, a):
            return np.array(a)**2

        def inverted(self):
            return SquareRootScale.SquareRootTransform()

    def get_transform(self):
        """Return the forward (square-root) transform used by this scale."""
        return self.SquareRootTransform()


# Make the scale available by name to set_xscale / set_yscale.
mscale.register_scale(SquareRootScale)
In [109]:
# Try the custom 'squareroot' scale on the y axis of the age / friend_count scatter
fig, ax = plt.subplots(nrows=1)
ax.set_yscale('squareroot')
pf.plot(kind='scatter', x='age', y='friend_count', alpha=1/20, ax=ax)
ax.set_xlim(13, 90)
# Doesn't work
Out[109]:
In [161]:
# Python counterpart of the R overlay plot, using a 'symlog' y axis in place of
# the sqrt transform: raw points in orange, per-age mean and the
# 10th / 50th / 90th percentiles in blue (10th and 90th dashed).
fig, ax = plt.subplots(1)
ax.set_yscale('symlog')
pf.plot.scatter(x='age', y='friend_count', alpha=1/20, color='orange', ax=ax)
pf_c_by_age.plot.line(x='age', y='friend_count_mean', color='blue', ax=ax)
# Quantiles are computed on the fly, one line per quantile
for q, dash in [(0.1, '--'), (0.5, '-'), (0.9, '--')]:
    pf.groupby('age').friend_count.quantile(q).plot.line(color='blue', style=dash, ax=ax)
ax.set_xlim(13, 90)
ax.set_ylim(ymin=1);
In [162]:
# Without the axis scaling
fig, ax = plt.subplots(1)
# ax.set_yscale('symlog')
pf.plot.scatter(x='age', y='friend_count', alpha=1/20,color='orange', ax=ax)
pf_c_by_age.plot.line(x='age', y='friend_count_mean',color='blue', ax=ax)
#Ploting the quantile on the fly
pf.groupby('age').friend_count.quantile(0.1).plot.line(color='blue',style='--', ax=ax)
pf.groupby('age').friend_count.quantile(0.5).plot.line(color='blue',style='-', ax=ax)
pf.groupby('age').friend_count.quantile(0.9).plot.line(color='blue',style='--', ax=ax)
plt.xlim(13,90);
# plt.ylim(ymin=1);
In [166]:
# zooming in
fig, ax = plt.subplots(1)
# ax.set_yscale('symlog')
pf.plot.scatter(x='age', y='friend_count', alpha=1/10,color='orange', ax=ax)
pf_c_by_age.plot.line(x='age', y='friend_count_mean',color='black', ax=ax)
#Ploting the quantile on the fly
pf.groupby('age').friend_count.quantile(0.1).plot.line(color='blue',style='--', ax=ax)
pf.groupby('age').friend_count.quantile(0.5).plot.line(color='blue',style='-', ax=ax)
pf.groupby('age').friend_count.quantile(0.9).plot.line(color='red',style='--', ax=ax)
plt.xlim(13,90)
plt.ylim(0,1000);
I am still missing the upper part of the y-range, because I could not apply a square-root scale to the y-axis in Python the way `coord_trans(y = 'sqrt')` does in R.
In [168]:
%%R
# Correlation test between age and friend count (cor.test comes from the stats package)
library(stats)
cor.test(pf$age, pf$friend_count)
In [169]:
%%R
# Alternative: with() evaluates the expression inside pf, so columns can be named directly
with(pf, cor.test(age, friend_count))
In [172]:
# Correlation matrix of the two columns (pandas)
pf[['age','friend_count']].corr()
Out[172]:
In [173]:
# The same correlation as a single number, Series against Series
pf.age.corr(pf.friend_count)
Out[173]:
In [174]:
# NumPy variant: returns the correlation-coefficient matrix of the two variables
np.corrcoef(pf.age, pf.friend_count)
Out[174]:
In [175]:
# NOTE: np.correlate computes the cross-correlation (a sum of products) of the two
# sequences, not a correlation coefficient, so it is not comparable to the values above.
np.correlate(pf.age, pf.friend_count) ##??
Out[175]:
In [176]:
%%R
# Same correlation test restricted to users aged 70 or younger
with(subset(pf, age<=70), cor.test(age, friend_count))
In [178]:
# pandas counterpart: only the age Series is filtered; corr() pairs the two
# Series by index label, so friend_count is restricted to the same rows.
pf.loc[pf.age <= 70, 'age'].corr(pf.friend_count)
Out[178]:
In [179]:
%%R
# Rank-based (Spearman) correlation test for users aged 70 or younger
with(subset(pf, age<=70), cor.test(age, friend_count, method = 'spearman'))
In [180]:
# pandas counterpart of the Spearman correlation for ages <= 70
pf.loc[pf.age <= 70, 'age'].corr(pf.friend_count, method='spearman')
Out[180]:
In [182]:
%%R
# Likes received overall against likes received through the website.
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point()
In [185]:
# pandas counterpart of the likes scatter plot
pf.plot(kind='scatter', x='www_likes_received', y='likes_received');
In [207]:
%%R
# Zoom both axes to the 95th percentile of their variable and add a linear
# fit (red line).
ggplot(data = pf, mapping = aes(x = www_likes_received, y = likes_received)) +
  geom_point() +
  xlim(0, quantile(pf$www_likes_received, 0.95)) +
  ylim(0, quantile(pf$likes_received, 0.95)) +
  geom_smooth(method = 'lm', color = 'red')
In [216]:
# Likes scatter plot with both axes cut at the 95th percentile of their variable
fig, ax = plt.subplots(1)
pf.plot(kind='scatter', x='www_likes_received', y='likes_received', ax=ax)
# pf.plot.line(x = 'www_likes_received', y='likes_received', ax=ax, color='red') # Doesn't work
ax.set_xlim(0, pf.www_likes_received.quantile(.95))
ax.set_ylim(0, pf.likes_received.quantile(.95));
In [240]:
# The regression below is fitted on a 2-D feature array, so turn the single
# column into an array with one row per observation and exactly one column.
X = pf.www_likes_received.values.reshape(-1, 1)
X
Out[240]:
In [245]:
# Fit an ordinary least-squares line with scikit-learn and draw it (red) over
# the scatter plot, with both axes cut at the 95th percentile.
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(X, pf.likes_received)
fig, ax = plt.subplots(1)
pf.plot(kind='scatter', x='www_likes_received', y='likes_received', ax=ax)
ax.plot(X, reg.predict(X), color='red')
# pf.plot.line(x = 'www_likes_received', y='likes_received', ax=ax, color='red') # Doesn't work
ax.set_xlim(0, pf.www_likes_received.quantile(.95))
ax.set_ylim(0, pf.likes_received.quantile(.95));
Seaborn is the faster solution in this case
In [206]:
# Scatter plot with a robust regression line (red, no confidence band) in a
# single seaborn call. The limits are set on the Axes returned by regplot:
# the `sns.plt` alias was removed from seaborn, so it must not be relied on.
ax = sns.regplot(x='www_likes_received', y='likes_received', data=pf,
                 line_kws={'color': 'red'}, robust=True, ci=None)
ax.set_xlim(0, pf.www_likes_received.quantile(.95))
ax.set_ylim(0, pf.likes_received.quantile(.95));
In [191]:
from ggplot import *
In [227]:
# The same plot with the Python `ggplot` package; parentheses replace the
# backslash line continuations.
(
    ggplot(aes(x='www_likes_received', y='likes_received'), data=pf)
    + geom_point()
    + xlim(0, pf.www_likes_received.quantile(.95))
    + ylim(0, pf.likes_received.quantile(.95))
    + stat_smooth(color='red')  ## Yeay!!
)
Out[227]:
Python-ggplot example:
In [198]:
# Reshape three columns of `meat` to long format (one row per date and
# variable) and plot them coloured by variable, with a smoother on top.
meat_lng = pd.melt(meat[['date', 'beef', 'pork', 'broilers']], id_vars='date')
(
    ggplot(aes(x='date', y='value', colour='variable'), data=meat_lng)
    + geom_point()
    + stat_smooth(color='red')
)
Out[198]:
In [246]:
%%R
# Pearson correlation test between website likes and total likes received
with(pf, cor.test(www_likes_received, likes_received, method = 'pearson'))
In [259]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is loaded (it only works if another library imported it).
import scipy.stats
# pearsonr returns (correlation coefficient, p-value); keep the coefficient only
c = scipy.stats.pearsonr(pf.www_likes_received, pf.likes_received)[0]
round(c,3)
Out[259]:
In [266]:
# List the working directory again
%ls
In [9]:
%%R
# Read the Mitchell data set into an R data frame and preview it.
Mitchell <- read.csv('Mitchell.csv')
head(Mitchell)
In [10]:
# Fetch the R object `Mitchell` from the embedded R session through rpy2.
# Presumably this arrives as a pandas DataFrame because pandas2ri is activated;
# later cells call pandas methods on it.
mitchell = r.Mitchell
In [271]:
%%R
# Temperature against month number.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()
In [272]:
# Let's use the R object directly from Python, just for fun
r.Mitchell.head()
Out[272]:
In [275]:
# pandas counterpart of the Month / Temp scatter plot
r.Mitchell.plot(kind='scatter', x='Month', y='Temp');
a. Take a guess for the correlation coefficient for the scatterplot. 0.5
b. What is the actual correlation of the two variables? (Round to the thousandths place) 0.057
In [276]:
%%R
# Correlation coefficient alone, then the full correlation test
with(Mitchell, cor(Month, Temp))
with(Mitchell, cor.test(Month, Temp))
In [278]:
# Python counterpart: returns the Pearson correlation coefficient and its p-value
scipy.stats.pearsonr(mitchell.Month, mitchell.Temp)
Out[278]:
In [283]:
%%R
# Smallest and largest month number, used to choose the axis breaks
range(Mitchell$Month)
In [281]:
%%R
# Same scatter plot with an x-axis break every 12 months.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, 12))
In [289]:
# pandas counterpart of R's range(): pick min and max out of describe()
r.Mitchell.Month.describe().loc[['min', 'max']]
Out[289]:
In [291]:
# Alternative: Series.ptp() is not an equivalent of R's range() -- it returned
# the peak-to-peak difference (max - min) and no longer exists in current
# pandas. Build the (min, max) pair directly instead.
(r.Mitchell.Month.min(), r.Mitchell.Month.max())
Out[291]:
In [280]:
# pandas counterpart: one x tick every 12 months, set on the returned Axes
r.Mitchell.plot(kind='scatter', x='Month', y='Temp').set_xticks(range(0, 203, 12));
In [292]:
%%R
# Detect the yearly fluctuation: fold the month number onto 0-11 so that all
# years are overlaid.
ggplot(data = Mitchell, mapping = aes(x = (Month %% 12), y = Temp)) +
  geom_point()
In [300]:
# Stretched (24 x 8) version of the scatter plot with a tick every 12 months
r.Mitchell.plot(kind='scatter', x='Month', y='Temp',
                figsize=(24, 8)).set_xticks(range(0, 203, 12));
John Tukey
William Playfair
William Playfair and the Psychology of Graphs
There are other measures of associations that can detect this. The dcor.ttest() function in the energy package implements a non-parametric test of the independence of two variables. While the Mitchell soil dataset is too coarse to identify a significant dependency between "Month" and "Temp", we can see the difference between dcor.ttest and cor.test through other examples, like the following:
x <- seq(0, 4*pi, pi/20)
y <- cos(x)
qplot(x = x, y = y)
dcor.ttest(x, y)
There is a yearly cyclical pattern here that is obvious if we stretch the plot.
We can also overlay each year and see the yearly pattern as follows:
In [304]:
# Fold the month number onto 0-11 (month within the year) and overlay all years
mitchell['monthperyear'] = mitchell['Month'].mod(12)
mitchell.plot(kind='scatter', x='monthperyear', y='Temp');
In [12]:
%%R
# Fractional age: whole years plus the share of a year derived from the birth month
pf$age_with_months <- pf$age + (12 - pf$dob_month) / 12
head(pf)
In [13]:
# pandas counterpart: fractional age derived from age and birth month
pf['age_with_months'] = pf['age'] + (12 - pf['dob_month']) / 12
pf.head()
Out[13]:
In [ ]:
# Create a new data frame called
# pf.fc_by_age_months that contains
# the mean friend count, the median friend
# count, and the number of users in each
# group of age_with_months. The rows of the
# data frame should be arranged in increasing
# order by the age_with_months variable.
# For example, the first two rows of the resulting
# data frame would look something like...
# age_with_months friend_count_mean friend_count_median n
# 13 275.0000 275 2
# 13.25000 133.2000 101 11
# See the Instructor Notes for two hints if you get stuck.
# This programming assignment will automatically be graded.
In [15]:
%%R
# Mean, median and number of users of friend_count for every distinct
# age_with_months value.
fc_by_age_months_groups <- group_by(pf, age_with_months)
pf.fc_by_age_months <- summarise(fc_by_age_months_groups,
                                 friend_count_mean = mean(friend_count),
                                 friend_count_median = median(friend_count),
                                 n = n())
# Sort by the grouping column and store the result back under the same name.
# (The sorted frame used to be assigned to a misspelled variable, and arrange()
# was called without a sort column, which leaves the row order unchanged.)
pf.fc_by_age_months <- arrange(pf.fc_by_age_months, age_with_months)
head(pf.fc_by_age_months)
In [17]:
# pandas counterpart: mean, median and row count of friend_count per
# age_with_months; the group key stays in the index here.
fc_by_age_months_grouped = pf.groupby('age_with_months')
fc_by_age_months = fc_by_age_months_grouped['friend_count'].agg([np.mean, np.median, len])
fc_by_age_months.head()
Out[17]:
In [25]:
# Alternative: the same summary as a flat frame, with the group key as a regular
# column and the aggregate columns renamed to match the R data frame.
fc_by_age_months_grouped = pf.groupby('age_with_months')
fc_by_age_months = (
    fc_by_age_months_grouped['friend_count']
    .agg([np.mean, np.median, len])
    .rename(columns={'mean': 'friend_count_mean',
                     'median': 'friend_count_median',
                     'len': 'n'})
    .reset_index()  # 'age_with_months' moves from the index into a column
)
fc_by_age_months.head()
Out[25]:
In [26]:
%%R
# Exercise: line plot of friend_count_mean against age_with_months, using the
# summary data frame from the previous exercise and only ages below 71.
ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
       mapping = aes(x = age_with_months, y = friend_count_mean)) +
  geom_line()
In [29]:
# pandas counterpart: mean friend count by fractional age, ages below 71 only
fc_by_age_months.query('age_with_months < 71').plot(x='age_with_months', y='friend_count_mean');
In [32]:
%%R
# Compare three levels of smoothing of mean friend count for ages below 71,
# stacked in a single column in the order p2, p1, p3.
library(gridExtra)

# p1: one point per whole year of age, with a smoother on top
p1 <- ggplot(data = subset(pf.fc_by_age, age < 71),
             mapping = aes(x = age, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p2: one point per age_with_months value, with a smoother on top
p2 <- ggplot(data = subset(pf.fc_by_age_months, age_with_months < 71),
             mapping = aes(x = age_with_months, y = friend_count_mean)) +
  geom_line() +
  geom_smooth()

# p3: raw data with age rounded to the nearest multiple of 5, mean per bin
p3 <- ggplot(data = subset(pf, age < 71),
             mapping = aes(x = round(age / 5) * 5, y = friend_count)) +
  geom_line(stat = 'summary', fun.y = mean)

grid.arrange(p2, p1, p3, ncol = 1)
In [59]:
# Python counterpart of the three stacked R panels (ages below 71):
#   ax1 - mean friend count per age_with_months, with a quadratic fit
#   ax2 - mean friend count per whole year of age, with a quadratic fit
#   ax3 - mean friend count with age rounded to the nearest multiple of 5
fig, (ax1, ax2, ax3) = plt.subplots(3, 1)

# Small hack: seaborn's regression line is overlaid on a line drawn by pandas.
months_under_71 = fc_by_age_months[fc_by_age_months.age_with_months < 71]
months_under_71.plot(x='age_with_months', y='friend_count_mean', ax=ax1)
sns.regplot(x='age_with_months', y='friend_count_mean', ax=ax1,
            data=months_under_71, scatter=False, order=2, color='black')

years_under_71 = pf_c_by_age[pf_c_by_age.age < 71]
years_under_71.plot(x='age', y='friend_count_mean', ax=ax2)
sns.regplot(x='age', y='friend_count_mean', ax=ax2,
            data=years_under_71, scatter=False, order=2, color='black')

# Work on a filtered view under its own name so that the full `pf` frame is not
# overwritten (re-running earlier cells keeps working) and no column is assigned
# on a slice. Only friend_count is averaged, because a mean over every column
# fails on non-numeric columns in current pandas.
pf_under_71 = pf[pf.age < 71]
round_age = np.round(pf_under_71.age / 5) * 5
mean_fc_by_round_age = pf_under_71.friend_count.groupby(round_age).mean()
ax3.plot(mean_fc_by_round_age.index, mean_fc_by_round_age.values);
Things are so much more complicated if one tries to do the same thing in Python, but it is doable.
In [ ]: