In [116]:
# rpy2 bridge: loading the extension provides the %R / %%R magics used by the R cells.
import rpy2.interactive
import rpy2.interactive.packages
%load_ext rpy2.ipython
# Directly convert objects from pandas to R and vice versa
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('seaborn-ticks')
# Set the global figure size
plt.rcParams['figure.figsize'] = (8.0, 8.0)
import warnings
# Suppresses every warning for the rest of the session
warnings.filterwarnings('ignore')
# plotly is only reachable through the aliases tls / py / go below.
# NOTE(review): none of these imports binds the bare name `plotly`, yet later cells
# call plotly.offline.* directly - confirm where that name comes from.
import plotly.tools as tls
import cufflinks as cf
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
# Star import: pulls the Python ggplot names (ggplot, aes, geom_*, ...) into the namespace
from ggplot import *
In [2]:
%%R
# Attach the R packages used by the %%R cells in this notebook.
library(ggplot2)
library(dplyr)
library(gridExtra)
library(plotly)
In [126]:
%%R
# Read the tab-separated file into the R data frame `pf`, then show its first rows,
# its column names and a per-column summary.
pf <-read.csv('~/Data-Science/dand/exploratory_data_analysis/lesson3/pseudo_facebook.tsv', sep='\t')
head(pf)
names(pf)
summary(pf)
In [5]:
%R summ <- str(pf)
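Trying to capture the output of `str` in R: a plain `summ <- str(pf)` does not work, because `str()` only prints its report and returns NULL; the printed text has to be collected with `capture.output()`.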
In [6]:
%R summ
In [127]:
# Same tab-separated file as the R cell, loaded into a pandas DataFrame
facebook_tsv = '~/Data-Science/dand/exploratory_data_analysis/lesson3/pseudo_facebook.tsv'
pf = pd.read_csv(facebook_tsv, sep='\t')
pf.head()
Out[127]:
In [8]:
pf.describe()
Out[8]:
It is not really possible to transfer objects efficiently from Python to R. It only works from R -> Python;
the solution is to save to disk and have R read it from there.
Let's give it a try.
In [9]:
# Work on a copy and write it to the file 'tpf' (CSV, without the index column)
tpf = pf.copy()
tpf.to_csv(path_or_buf='tpf', index=False)
In [10]:
%%R
# The file was written by pandas, which uses '.' as the decimal mark. dec = ","
# would also clash with the field separator, so fractional columns would not be
# parsed as numbers.
tpf <- read.csv('tpf', header = TRUE, sep = ",", dec = ".")
In [11]:
%%R
str(tpf)
In [12]:
%ls
In [13]:
%%R
# Histogram of friend_count, one facet per gender (two columns); rows whose gender
# is NA are removed first.
ggplot(aes(x = friend_count, color=age),
data = subset(pf, !is.na(gender))) + geom_histogram() +
facet_wrap(~gender, ncol=2)
In [14]:
pf.hist(column='friend_count', by='gender');
In [15]:
%%R
# Box plot of age per gender (rows with NA gender removed)
ggplot(aes(x = gender, y = age),
data = subset(pf, !is.na(gender))) +
geom_boxplot() +
# overlay each group's mean as an x-shaped point (shape = 4)
stat_summary(fun.y = mean, geom = 'point', shape = 4)
In [16]:
sns.boxplot(x='gender', y='age', data=pf);
In [17]:
%%R
# Line plot: median friend_count at each age, one coloured line per gender
ggplot(aes(x = age, y = friend_count),
data = subset(pf, !is.na(gender))) +
geom_line(aes(color=gender), stat = 'summary', fun.y = median)
In [18]:
# pandas/matplotlib counterpart of the ggplot line chart: median friend_count per age,
# one line per gender. The median matches the fun.y = median summary of the R cell,
# and each line carries its own label so the legend cannot be mismatched.
fig, ax = plt.subplots(1)
for gender, line_color in (('male', 'lightblue'), ('female', 'pink')):
    (pf[pf.gender == gender]
       .groupby('age').friend_count.median()
       .plot.line(ax=ax, color=line_color, label=gender))
ax.set_xlabel('age')
ax.set_ylabel('median friend_count')
ax.legend();
Write code to create a new data frame, called 'pf.fc_by_age_gender', that contains information on each age AND gender group.
The data frame should contain the following variables:
mean_friend_count,
median_friend_count,
n (the number of users in each age and gender grouping)
Here is an example of the structure of your data frame. Your data values will be different. Note that if you are grouping by more than one variable, you will probably need to call the ungroup() function.
  age gender mean_friend_count median_friend_count    n
1  13 female          247.2953                 150  207
2  13   male          184.2342                  61  265
3  14 female          329.1938                 245  834
4  14   male          157.1204                  88 1201
In [128]:
%%R
# Two-step version: group the rows with a known gender by age and gender, then
# reduce each group to its mean, median and row count.
pf.fc_by_age_gender <- group_by(subset(pf, !is.na(gender)), age, gender)
pf.fc_by_age_gender <- summarise(pf.fc_by_age_gender,
mean_friend_count = mean(friend_count),
median_friend_count = median(friend_count),
n = n())
# ungroup() step kept for reference but disabled:
#pf.fc_by_age_gender = ungroup(pf.fc_by_age_gender)
pf.fc_by_age_gender
In [129]:
%%R
# Same summary as a dplyr pipeline: keep rows with a known gender, group by age and
# gender, and reduce each group to mean, median and size.
pf.fc_by_age_gender <- pf %>%
  filter(!is.na(gender)) %>%
  group_by(age, gender) %>%
  summarise(mean_friend_count = mean(friend_count),
            median_friend_count = median(friend_count),
            # name the count explicitly: a bare n() produces a column called `n()`
            n = n())
# ungroup() It looks like it is not necessary
pf.fc_by_age_gender
In [130]:
# In pandas: aggregate the grouped column with a list of functions, then rename the
# resulting columns. (Passing a {new_name: func} dict to a grouped Series was
# deprecated and is rejected by pandas >= 1.0.)
pf_grouped = pf.groupby(['age', 'gender'])
pf_fc_by_age_gender = (
    pf_grouped.friend_count
    .agg(['mean', 'median', 'size'])
    .rename(columns={'mean': 'mean_friend_count',
                     'median': 'median_friend_count',
                     'size': 'n'})
)
pf_fc_by_age_gender.head()
Out[130]:
In [22]:
%%R
# Mean friend count against age from the summary frame, one coloured line per gender
ggplot(aes(x=age, y=mean_friend_count, color = gender), data = pf.fc_by_age_gender) +
geom_line()
In [23]:
pf_fc_by_age_gender.index.levels[1]
Out[23]:
In [24]:
# pandas: move the gender index level (level 1) into columns so that each gender
# becomes its own line, then draw them all on one axes (subplots=False)
ax = pf_fc_by_age_gender.mean_friend_count.unstack(level=1).plot(subplots=False)
In [102]:
# And this is how it looks unstacked
pf_fc_by_age_gender.mean_friend_count.unstack(level=1).head()
Out[102]:
In [131]:
%%R
# Reshape the long summary to wide form: one row per age, one column per gender,
# cells filled with median_friend_count.
library(reshape2)
pf.fc_by_age_gender.wide <- dcast(pf.fc_by_age_gender,
age ~ gender,
value.var = 'median_friend_count')
head(pf.fc_by_age_gender.wide)
In [104]:
# pivot_table aggregates with the mean by default, so the median is requested explicitly
pf.pivot_table(index='age', columns='gender', values='friend_count', aggfunc=np.median).head()
Out[104]:
In [105]:
# The same wide table via crosstab: rows = age, columns = gender, cells = median friend_count
pd.crosstab(pf.age, pf.gender, values=pf.friend_count, aggfunc=np.median).head()
# Note that we start at the original dataframe here
Out[105]:
Plot the ratio of the female to male median friend counts using the data frame pf.fc_by_age_gender.wide.
Think about what geom you should use. Add a horizontal line to the plot with a y intercept of 1, which will be the base line. Look up the documentation for geom_hline to do that. Use the parameter linetype in geom_hline to make the line dashed.
The linetype parameter can take the values 0-6: 0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
In [29]:
%%R
# Ratio of the female to male median friend count at each age
ggplot(aes(x = age, y = female/male), data = pf.fc_by_age_gender.wide) +
geom_line() +
# dashed (linetype = 2), semi-transparent reference line at ratio 1
geom_hline(yintercept = 1, alpha = 0.3, linetype = 2)
In [132]:
#Pandas
# Wide table of median friend_count: rows = age, columns = gender
pf_wide = pd.crosstab(pf.age, pf.gender, values=pf.friend_count, aggfunc=np.median)
# Ratio of female to male medians per age
ax = (pf_wide.female / pf_wide.male).plot()
# Reference line at ratio 1. axhline spans the whole axes, so no x-range has to be
# hard-coded and the x-limits stay those of the data.
ax.axhline(y=1, color='black', linestyle='dashed', alpha=0.3);
In [133]:
%%R
# Approximate year of joining: tenure is counted in days, so it is converted to
# years with 365 days per year and subtracted from 2014.
pf$year_joined <- floor(2014 - pf$tenure/365)
head(pf)
In [134]:
#Pandas
# Approximate year of joining: tenure is counted in days, so use 365 days per year
pf['year_joined'] = np.floor(2014 - pf.tenure / 365)
# The column stays float: rows without a tenure are NaN, which a plain integer column
# cannot hold (see https://pandas.pydata.org/pandas-docs/stable/gotchas.html), so
# writing integers back into it would not change its dtype.
pf.head()
Out[134]:
In [33]:
%%R
summary(pf$year_joined)
In [34]:
pf.year_joined.describe()
Out[34]:
In [35]:
%%R
#get the count per year
table(pf$year_joined)
In [36]:
#Pandas
table = pf.year_joined.value_counts().sort_index()
table
Out[36]:
Create a new variable in the data frame called year_joined.bucket by using the cut function on the variable year_joined.
You need to create the following buckets for the new variable, year_joined.bucket
(2004, 2009]
(2009, 2011]
(2011, 2012]
(2012, 2014]
Note that a parenthesis means exclude the year and a bracket means include the year.
In [135]:
%%R
# Bin year_joined at the break points 2004, 2009, 2011, 2012, 2014 and count the
# users in each bin.
pf$year_joined.bucket = cut(pf$year_joined, c(2004, 2009, 2011, 2012, 2014))
table(pf$year_joined.bucket)
In [136]:
# pandas: bin year_joined at the same break points (pd.cut closes bins on the right
# by default) and count users per bin in bin order rather than by frequency
pf['year_joined_bucket'] = pd.cut(pf.year_joined, (2004, 2009, 2011, 2012, 2014))
pf.year_joined_bucket.value_counts(sort=False)
Out[136]:
In [39]:
# NOTE: this rebinds `plt` from matplotlib.pyplot to bqplot's pyplot. A later cell
# re-imports matplotlib.pyplot as plt before matplotlib is used again.
from bqplot import pyplot as plt
# Star import brings bqplot's public names (e.g. Tooltip, used below) into scope
from bqplot import *
In [40]:
pf_age_ratio = pf_wide.female/pf_wide.male
In [41]:
# bqplot's pyplot gets its own alias so this cell does not depend on which library
# the name `plt` currently points at.
from bqplot import pyplot as bqplt
# Tooltip.fields takes the names of the mark attributes to show ('x', 'y'), not the
# data itself; labels are the captions for those fields, in the same order.
tp = Tooltip(fields=['x', 'y'], labels=['age', 'female/male ratio'])
# Pass age explicitly as x so that the axis and tooltip show ages, not row positions
bqplt.plot(pf_age_ratio.index.values, pf_age_ratio.values, tooltip=tp)
bqplt.show()
In [43]:
# Female/male ratio against age as a plotly line with a range slider
trace = go.Scatter(
    x = pf_age_ratio.index,
    y = pf_age_ratio,
    name='Gender Ratio',
    # text='age',
    hoverinfo = 'all'
)
data = [trace]
layout = dict(
    title='Line Plot with range slider and selectors',
    xaxis=dict(
        rangeslider=dict(),
    ),
    shapes= [
        # Horizontal dashed reference line at ratio 1, from x = 0 to the largest age
        {
            'type': 'line',
            'x0': 0,
            'y0': 1,
            'x1': pf_age_ratio.index.max(),
            'y1': 1,
            'line': {
                'color': 'black',
                'width': 0.8,
                'dash': 'dash',
            }}
    ]
)
fig = dict(data=data, layout=layout)
# plotly.offline is imported as `py`; the bare name `plotly` is never bound by an import
py.iplot(fig)
Create a line graph of friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age. This means you should have four different lines on your plot.
You should subset the data to exclude the users whose year_joined.bucket is NA.
In [44]:
%%R
# Median friend_count against age, one line per year_joined.bucket; rows whose
# bucket is NA are removed first.
ggplot(aes(x=age, y=friend_count), data=subset(pf,!is.na(pf$year_joined.bucket))) +
geom_line(aes(color=year_joined.bucket), stat = 'summary', fun.y = median)
In [45]:
#pandas - The best way is by using pivot_table
pf.pivot_table(index='age', columns='year_joined_bucket', values='friend_count', aggfunc='median').plot();
In [108]:
#let's take a look at the table
pf.pivot_table(index='age', columns='year_joined_bucket', values='friend_count', aggfunc='median').head()
Out[108]:
In [47]:
pfg = pf.pivot_table(index='age', columns='year_joined_bucket', values='friend_count', aggfunc='median')
pfg.columns
Out[47]:
In [48]:
pfg.columns[0]
Out[48]:
In [49]:
# One styled trace per year_joined bucket, built in a loop instead of four
# hand-copied blocks. Colours are assigned to the columns of pfg in order.
bucket_colors = ['blue', 'green', 'red', 'purple']
data = [
    go.Scatter(
        x=pfg.index,
        y=pfg[bucket],
        name=str(bucket),
        line=dict(color=color, width=1),
    )
    for bucket, color in zip(pfg.columns, bucket_colors)
]
# Edit the layout
layout = dict(title = 'Median friend count per year_joined_bucket',
              xaxis = dict(title = 'Age'),
              yaxis = dict(title = 'Friend count')
              )
fig = dict(data=data, layout=layout)
# Use plot instead of iplot if you want to save the html file.
# plotly.offline is imported as `py`; the bare name `plotly` is never bound by an import.
py.iplot(fig, show_link=False)
In [50]:
# A simpler alternative: plain dict traces, one per column of pfg.
# plotly.offline is imported as `py`; the bare name `plotly` is never bound by an import.
py.iplot([{
    'x': pfg.index,
    'y': pfg[col],
    'name': str(col)
} for col in pfg.columns], show_link=False, filename='plot.html')
In [52]:
#That works pretty well too
pfg.iplot(theme='white', colorscale='set2', fill=False)
In [53]:
#Subplots!
pfg.iplot(subplots=True, offline_show_link=False, filename='subplots.html')
In [54]:
#Get cufflink themes
cf.colors.scales()
In [55]:
# Have cufflinks return the figure (asFigure=True) instead of displaying it, then
# write it to subplots.html through the `py` alias of plotly.offline (the bare name
# `plotly` is never bound by an import).
fig = pfg.iplot(subplots=True, offline_show_link=False, asFigure=True )
py.plot(fig, filename='subplots.html')
Out[55]:
In [56]:
#Check out the matplotlib option - good but needs tweaking to work properly
# Re-import so `plt` is matplotlib again (the bqplot import rebound it earlier)
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1)
pf[pf.gender=='male'].groupby('age').friend_count.mean().plot.line(x='age', y='friend_count', ax=ax, color='lightblue')
pf[pf.gender=='female'].groupby('age').friend_count.mean().plot.line(x='age', y='friend_count', ax=ax, color='pink')
ax.legend(('male', 'female'))
# Convert the matplotlib figure to plotly via the `py` alias of plotly.offline
# (the bare name `plotly` is never bound by an import)
py.iplot_mpl(fig, resize=True, strip_style=True)
In [57]:
#how is it doing with seaborn? Does not work very well
fig = sns.boxplot(x='gender', y='age', data=pf).get_figure()
# `py` is the notebook's alias for plotly.offline (the bare name `plotly` is never
# bound by an import)
py.iplot_mpl(fig);
Write code to do the following:
(1) Add another geom_line to code below to plot the grand mean of the friend count vs age.
(2) Exclude any users whose year_joined.bucket is NA.
(3) Use a different line type for the grand mean.
As a reminder, the parameter linetype can take the values 0-6:
0 = blank, 1 = solid, 2 = dashed, 3 = dotted, 4 = dotdash, 5 = longdash, 6 = twodash
In [58]:
%%R
# Mean friend_count against age: one coloured line per year_joined.bucket plus the
# grand mean over all buckets as a dot-dash line (linetype = 4). Rows with an NA
# bucket are removed first.
p= ggplot(aes(x=age, y=friend_count), data=subset(pf,!is.na(pf$year_joined.bucket))) +
geom_line(aes(color=year_joined.bucket), stat = 'summary', fun.y = mean) +
geom_line(aes(),linetype = 4, stat = 'summary', fun.y=mean) +
theme_light()
# convert the ggplot object to a plotly object
p = ggplotly(p)
# save as html so the next cell can embed it in an IFrame
htmlwidgets::saveWidget(as.widget(p), "r-plot.html")
In [59]:
from IPython.display import IFrame
IFrame('r-plot.html', width=800, height=600)
Out[59]:
In [60]:
%%R
s = with(subset(pf, tenure >=1), summary(friend_count / tenure))
s
In [61]:
%R -o s
In [62]:
s
Out[62]:
In [63]:
#Pandas
# query() and boolean-mask selection should pick the same rows. .equals() answers
# that with a single True/False; an element-wise == returns a whole boolean frame
# and reports False wherever both sides are NaN.
pf.query('tenure >= 1').equals(pf[pf.tenure >= 1])
Out[63]:
In [64]:
from pandasql import *
In [65]:
def pysqldf(q):
    """Run the SQL query `q` through sqldf, giving it access to this module's globals."""
    return sqldf(q, globals())
In [67]:
%%script false
# Cell is disabled: the body is handed to the `false` command instead of being run as Python.
# Intended query: every row of pf with at least one day of tenure.
q = '''
SELECT
*
FROM
pf
WHERE
pf.tenure>=1;
'''
# Calls sqldf directly with locals() rather than going through the pysqldf wrapper
sqldf(q, locals())
Not so simple! will come back to this
In [68]:
#let's head back to our quiz
# Filter first: keep users with at least one day of tenure (this also avoids dividing
# by zero), then summarise friend_count per day of tenure
pf_ag = pf[pf.tenure>=1]
(pf_ag.friend_count/pf_ag.tenure).describe()
Out[68]:
Create a line graph of mean of friendships_initiated per day (of tenure) vs. tenure colored by year_joined.bucket.
You need to make use of the variables tenure, friendships_initiated, and year_joined.bucket.
You also need to subset the data to only consider user with at least one day of tenure.
In [69]:
%%R
# Mean friendships initiated per day of tenure against tenure, one line per
# year_joined.bucket, drawn at four smoothing levels (tenure rounded to multiples of
# 1, 7, 30 and 90 days). Only users with at least one day of tenure are used.
pf.active <- subset(pf, tenure >= 1)

# Build one panel: tenure is rounded to the nearest multiple of `bin.days` before
# the per-bin mean is taken; only the panel asked to will draw a legend.
rate_plot <- function(bin.days, show.legend = FALSE) {
  binned <- transform(pf.active, tenure.bin = bin.days * round(tenure / bin.days))
  ggplot(aes(x = tenure.bin, y = friendships_initiated / tenure), data = binned) +
    geom_line(aes(color = year_joined.bucket),
              stat = 'summary',
              fun.y = mean,
              show.legend = show.legend) +
    xlab(paste0('tenure (', bin.days, '-day bins)'))
}

p1 <- rate_plot(1, show.legend = TRUE)
p2 <- rate_plot(7)
p3 <- rate_plot(30)
p4 <- rate_plot(90)

# Build the stacked widget once, display it, then save it for the IFrame cell
sub <- subplot(p1, p2, p3, p4, nrows = 4)
sub
htmlwidgets::saveWidget(as.widget(sub), "subplots.html")
In [70]:
IFrame('subplots.html', width=1000, height=600)
Out[70]:
Not really what I wanted to see although the hover labels are cool. Have to find a way to show only one legend
In [71]:
#Pandas
# Friendships initiated per day of tenure, for users with at least one day of tenure
# (the quantity the exercise and the R cell plot - not friend_count).
pf_ag = pf[pf.tenure >= 1]
pf['friend_rate'] = pf_ag.friendships_initiated / pf_ag.tenure

# One panel per smoothing level: tenure is rounded to multiples of 1, 7, 30 and 90 days.
# The bucket column itself is the pivot key, so `pf` is not altered beyond friend_rate
# and re-running the cell gives the same result.
smoothers = [1, 7, 30, 90]
fig, axes = plt.subplots(nrows=len(smoothers))
for ax, smoother in zip(axes, smoothers):
    binned = pf_ag.assign(
        friend_rate=pf_ag.friendships_initiated / pf_ag.tenure,
        tenure=smoother * (pf_ag.tenure / smoother).round(),
    )
    rate_by_bucket = binned.pivot_table(values='friend_rate', index='tenure',
                                        columns='year_joined_bucket', aggfunc='mean')
    rate_by_bucket.plot(ax=ax, legend=False)
    ax.set_ylabel('{}-day bins'.format(smoother))
# `py` is the notebook's alias for plotly.offline (the bare name `plotly` is never
# bound by an import)
py.iplot_mpl(fig, resize=True, strip_style=False)
I need to fix the labels, but that ain't so bad.
In [72]:
%%R
# Smoothed friendships initiated per day of tenure against tenure, one smoother per
# year_joined.bucket, for users with at least one day of tenure.
p1 <- ggplot(aes(x = tenure, y = friendships_initiated / tenure),
data = subset(pf, tenure >= 1)) +
geom_smooth(aes(color = year_joined.bucket))
p1
# Save an interactive plotly version for the IFrame in the next cell
htmlwidgets::saveWidget(as.widget(ggplotly(p1)), "smooth.html")
In [73]:
IFrame('smooth.html', width=1000, height=600)
Out[73]:
No real alternative to geom smooth in python - except ggplot of course - will have to fill that in here
In [3]:
%%R
# Home-relative path, in the same style as the pseudo_facebook.tsv load, instead of
# an absolute path tied to one user account.
yo <- read.csv('~/Data-Science/dand/exploratory_data_analysis/lesson5/yogurt.csv')
#change id to a factor variable
yo$id = factor(yo$id)
In [4]:
%R str(yo)
In [5]:
%R head(yo)
Out[5]:
In [6]:
%%R
ggplot(aes(x=price), data = yo) +
geom_histogram()
In [7]:
#Move the dataframe to pandas
%R -o yo
In [8]:
yo.head()
Out[8]:
In [9]:
yo.describe()
Out[9]:
In [10]:
yo.info()
In [23]:
p = yo.hist('price', bins=50);
In [22]:
count, division = np.histogram(yo.price, bins = 50)
count,division
Out[22]:
In [27]:
count.size, division.size
Out[27]:
In [38]:
#or this is better?
yo.price.value_counts(sort=False).sort_index()
Out[38]:
I shouldn't have cut off the left value.
In R you could do it like this:
In [42]:
%%R
table(yo$price)
In [31]:
# Turn the histogram into a DataFrame to see it better. np.histogram returns one more
# edge than counts, so each bin is described by its left edge (division[:-1]) and its
# right edge (division[1:]); neither border is dropped.
hist = pd.DataFrame(data={'bin_left': division[:-1],
                          'bin_right': division[1:],
                          'count': count},
                    columns=['bin_left', 'bin_right', 'count'])
hist
Out[31]:
In [33]:
yo.price.iplot(kind='hist', subplots=True)
In [91]:
yo.nunique()
Out[91]:
In [32]:
%%R
# Row total over columns 4 to 8 of yo, stored as all.purchases.
# The column names are looked up once and actually used for the selection;
# rowSums is the vectorised form of apply(..., 1, sum).
columns <- names(yo)[4:8]
yo$all.purchases <- rowSums(yo[, columns])
head(yo)
In [13]:
#Pandas
# Row total over the columns at positions 3..7, using the vectorised DataFrame.sum
# instead of applying np.sum row by row
yo['all_purchases'] = yo.iloc[:, 3:8].sum(axis=1)
yo.head(6)
Out[13]:
In [45]:
# pandas alternative: add the selected columns together one Series at a time
purchase_columns = yo.columns[3:8]
column_total = sum([yo[name] for name in purchase_columns])
yo['all_purchases'] = pd.Series(column_total)
yo.head(6)
Out[45]:
%%R ggplot(aes(x=all.purchases), data = yo) + geom_histogram()
In [46]:
#pandas
yo.all_purchases.value_counts().sort_index()
Out[46]:
In [48]:
#plotly
yo.all_purchases.iplot(kind='hist')
In [53]:
yo.iloc[:,3:].iplot(kind='hist', subplots=True)
In [54]:
%%R
p <- ggplot(aes(y=price, x=time), data=yo) +
geom_point(alpha=1/4)
p
In [55]:
%%R
#Udacity's solution
p <- ggplot(aes(y=price, x=time), data=yo) +
geom_jitter(alpha=1/4, shape=21)
p
In [57]:
#pandas - seaborn
sns.stripplot(x='time', y='price', data=yo, jitter=True, alpha=1/4)
Out[57]:
In [63]:
#pandas - seaborn
sns.regplot(x='time', y='price', data=yo, x_jitter=.1, fit_reg=False, scatter_kws={'alpha':1/4})
Out[63]:
In [65]:
#pandas vanilla
yo.plot(kind='scatter', x='time', y='price', alpha=1/4);
In [73]:
#plotly
yo.iplot(kind='scatter', x='time', y='price', mode='markers+text', size=5);
In [74]:
%%R
# Fix the RNG seed, draw 16 of the id factor levels and print them
set.seed(4230)
sample.ids <- sample(levels(yo$id), 16)
sample.ids
# Price over time for the sampled ids, one facet per id; the size of the open-circle
# points (pch = 1) is mapped to all.purchases
ggplot(aes(x=time, y=price),
data = subset(yo, id %in% sample.ids)) +
facet_wrap(~id) +
geom_line() +
geom_point(aes(size=all.purchases), pch=1)
In [82]:
#pandas
np.random.seed(432)
# Draw 16 distinct ids, as the R cell does with sample(levels(yo$id), 16). Choosing
# from the raw column would favour ids with many rows and could pick an id twice.
sample_ids = np.random.choice(np.asarray(yo.id.unique()), 16, replace=False)
yos = yo[yo.id.isin(sample_ids)]
yos.head()
Out[82]:
In [88]:
# factorplot: price against time, one panel per id, wrapped after 4 panels per row
g = sns.factorplot(x="time", y='price',col_wrap=4,
col="id", data=yos,
size=5, aspect=.8)
In [91]:
# FacetGrid: one panel per id (4 per row), each drawing price against time with
# plt.plot and a '.' marker
g = sns.FacetGrid(yos, col="id", col_wrap=4, size=3)
g = g.map(plt.plot, "time", "price", marker=".")
There does not seem to be a simple solution to set the size of the dots automatically based on a variable. Looping through the variable and assigning a value should do it. Should see if ggplot for python can do that.
In [121]:
# ggplot for Python - got it! Price against time per id, with the point size mapped
# to all_purchases; a second, white point layer (size 10) is drawn on top.
ggplot(aes(x='time', y='price', size='all_purchases'), data = yos) +\
facet_wrap('id') +\
geom_line(size=1) +\
geom_point(pch=1)+\
geom_point(color='w', size=10)
Out[121]:
In [137]:
%%R
# Scatter-plot matrix of columns 2 to 15 of pf on a random sample of 1000 rows.
# install.packages('GGally', repos='http://cran.us.r-project.org')
library(GGally)
theme_set(theme_minimal(20))
# set the seed for reproducible results
set.seed(1836)
pf_subset <- pf[, c(2:15)]
names(pf_subset)
# NOTE(review): par() configures base graphics; confirm it has any effect on ggpairs output
par(pin=c(12, 10))
ggpairs(pf_subset[sample.int(nrow(pf_subset), 1000), ], axisLabels = 'internal')
In [125]:
%R head(pf)
Out[125]:
In [139]:
#pandas
# set the seed for reproducible results
np.random.seed(1836)
# Keep the columns from position 3 onwards.
# NOTE(review): the R cell selects columns 2:15, so the two subsets may not hold the
# same variables - confirm this is intended.
pf_subset = pf.iloc[:,3:]
pf_subset.columns
Out[139]:
In [142]:
# Correlation matrix on a random sample of 1000 rows.
# DataFrame.sample draws distinct rows (no repeats) and the fixed random_state makes
# the sample independent of how many random numbers earlier cells consumed.
corr = pf_subset.sample(n=1000, random_state=1836).corr()
corr
Out[142]:
In [143]:
# Scatter-plot matrix on 1000 distinct rows; the fixed random_state keeps the sample
# reproducible without relying on the global NumPy seed.
sns.pairplot(pf_subset.sample(n=1000, random_state=1836))
Out[143]:
In [149]:
# Empty PairGrid on 1000 distinct rows (no plotting function is mapped onto it here);
# the fixed random_state keeps the sample reproducible.
sns.PairGrid(pf_subset.sample(n=1000, random_state=1836))
Out[149]:
In [145]:
# Heatmap of the absolute correlations: corr.abs() replaces negative values by their
# magnitude, it does not drop them
sns.heatmap(corr.abs())
Out[145]:
In [ ]: