In [4]:
# %load /home/jkb/Data-Science/setup/eda_jupyter_setup.py
'''
EDA setup for Jupyter
'''
import rpy2.interactive
import rpy2.interactive.packages
%load_ext rpy2.ipython
# Directly convert objects from pandas to r and vsv
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('seaborn-ticks')
#Set plottng parameters
plt.rcParams['savefig.dpi'] = 200
plt.rcParams['figure.autolayout'] = False
plt.rcParams['figure.figsize'] = 18, 10
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14
plt.rcParams['text.usetex'] = False # True activates latex output in fonts!
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.serif'] = "cm"
# plt.rcParams['text.latex.preamble'] = "\usepackage{subdepth}, \usepackage{type1cm}"
#clear warning
import warnings
warnings.filterwarnings('ignore')
#import plotly
import plotly.tools as tls
import cufflinks as cf
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)
#import ggplot
from ggplot import *
# Load R libraries
%R library(ggplot2)
%R library(dplyr)
%R library(gridExtra)
%R library(plotly)
%R library(GGally)
Out[4]:
In this problem set, you'll continue to explore the diamonds data set.
In [5]:
%%R
# Your first task is to create a
# scatterplot of price vs x.
# using the ggplot syntax.
data(diamonds)
str(diamonds)
In [6]:
%%R
ggplot(aes(y=price, x=x), data = diamonds) +
geom_point()
In [13]:
diamonds = r.diamonds
diamonds.head()
Out[13]:
In [ ]:
# pass it with the -o switch directly from R
In [12]:
%R -o diamonds
In [11]:
#it is not a full pandas dataframe
diamonds.head()
Out[11]:
In [9]:
diamonds.plot.scatter(y= 'price', x ='x');
In [14]:
help(diamonds)
In [15]:
# Will have to convert the R object explicitily to a dataframe
diamonds = pandas2ri.ri2py_dataframe(r.diamonds)
diamonds.head()
Out[15]:
In [16]:
diamonds.plot.scatter(y= 'price', x ='x');
In [8]:
%%capture a
%%R
with(diamonds, cor.test(price, x))
In [9]:
a.outputs[0]
Out[9]:
In [10]:
str(a)
Out[10]:
In [11]:
a.stdout
Out[11]:
In [12]:
a.show()
In [13]:
b = a.show()
In [14]:
type(b)
Out[14]:
In [15]:
%%capture b
a.show()
In [16]:
b.outputs
Out[16]:
I can not get it with capture. Maybe if I rpy interface
In [17]:
a
Out[17]:
In [18]:
%R a= with(diamonds, cor.test(price, y)) -o a
Out[18]:
In [19]:
a.index
Out[19]:
In [20]:
%%R
with(diamonds, cor.test(price, z))
In [21]:
# Ideal for a nice little python script
for variable in('x', 'y', 'z'):
print('Correlation of Price with {}: {:.3f}'.format(variable, diamonds.price.corr(diamonds[variable])))
In [17]:
%%R
ggplot(aes(y=price, x=depth), data = diamonds) +
geom_point()
In [23]:
diamonds.plot.scatter(y= 'price', x ='depth');
In [24]:
%%R
ggplot(aes(y=price, x=depth), data = diamonds) +
geom_point(alpha=1/100) +
scale_x_continuous(breaks = seq(43,79,2))
In [25]:
diamonds.plot.scatter(y= 'price', x ='depth', alpha=1/100)
# plt.xlim(43,79)
x=diamonds.depth
plt.xticks(np.arange(min(x), max(x)+1, 2.0));
Based on the scatterplot most diamonds are within what range?
Between 60 and 63
In [26]:
%%R
summary(diamonds$depth)
In [27]:
diamonds.depth.describe()
Out[27]:
In [28]:
sns.jointplot(y= 'price', x ='depth', data=diamonds, alpha=1/100);
In [29]:
sns.jointplot(x= 'price', y ='depth', data=diamonds, kind='reg');
In [30]:
sns.jointplot(x= 'price', y ='depth', data=diamonds, kind='kde');
In [31]:
#Let's take a closer look
sns.jointplot(x= 'price', y ='depth', data=diamonds, kind='kde', xlim=(0,2000), ylim=(60,65));
In [32]:
#Let's take a closer look
sns.jointplot(x= 'price', y ='depth', data=diamonds, kind='scatter', xlim=(0,2000), ylim=(60,65), alpha=1/50);
In [33]:
%%R
#First the plain one
ggplot(aes(x=carat, y=price), data = diamonds) +
geom_point(alpha=1/100)
#xlim(0, quantile(diamonds$carat, 0.90)) +
#ylim(0, quantile(diamonds$carat, 0.90))
In [34]:
%%R
ggplot(aes(x=carat, y=price), data = diamonds) +
geom_point(alpha=1/100) +
xlim(0, quantile(diamonds$carat, 0.99)) +
ylim(0, quantile(diamonds$price, 0.99))
In [35]:
sns.jointplot(x= 'carat', y ='price', data=diamonds, kind='scatter',
xlim=(0,diamonds.carat.quantile(.99)),
ylim=(0,diamonds.price.quantile(.99)),
alpha=1/100, size=7);
This is cool but I will try now to recreate the scatter plot without the marginal histograms and with a regression line
In [36]:
from scipy import stats
In [37]:
stats.pearsonr(diamonds.price, diamonds.carat)
Out[37]:
In [38]:
# This works butt dos not accept the pearson annotation
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
g = sns.regplot(x= 'carat', y ='price', data=diamonds, ax=ax,
scatter_kws={'alpha':1/100},
line_kws={'color':'red'})
g.set(xlim=(0,diamonds.carat.quantile(.99)),
ylim=(0,diamonds.price.quantile(.99)));
In [39]:
g = sns.JointGrid(x= 'carat', y ='price', data=diamonds, ratio=100,
xlim=(0,diamonds.carat.quantile(.99)),
ylim=(0,diamonds.price.quantile(.99)))
g.plot_joint(sns.regplot,
scatter_kws={'alpha':1/100},
fit_reg=False)
g.annotate(stats.pearsonr);
# g.ax_marg_x.set_axis_off()
# g.ax_marg_y.set_axis_off()
In [40]:
%%R
#create new variable
diamonds$volume <- diamonds$x * diamonds$y * diamonds$z
ggplot(aes(x=volume, y=price), data = diamonds) +
geom_point(alpha=1/100) +
xlim(0, quantile(diamonds$volume, 0.99)) +
ylim(0, quantile(diamonds$price, 0.99))
In [41]:
#let's get the update df from the r object ;)
diamonds = pandas2ri.ri2py_dataframe(r.diamonds)
g = sns.JointGrid(x= 'volume', y ='price', data=diamonds, ratio=100,
xlim=(0,diamonds.volume.quantile(.99)),
ylim=(0,diamonds.price.quantile(.99)))
g.plot_joint(sns.regplot,
scatter_kws={'alpha':1/100},
line_kws={'color':'red'})
g.annotate(stats.pearsonr);
In [42]:
%%R
ggplot(aes(x=volume, y=price), data = diamonds) +
geom_point()
In [43]:
diamonds.plot.scatter(x ='volume', y= 'price');
In [44]:
# Number of outliers with zero volume
diamonds[diamonds.volume==0].index.value_counts().sum()
Out[44]:
In [45]:
%%R
with(subset(diamonds, (diamonds$volume!=0 & diamonds$volume<800)), cor.test(volume, price))
In [46]:
diamonds_sub = diamonds[(diamonds.volume!=0)&( diamonds.volume<800)]
diamonds_sub.volume.corr(diamonds_sub.price)
Out[46]:
!!You need to set the limitations beforehand if you want to get the right pearson score
Subset the data to exclude diamonds with a volume greater than or equal to 800. Also, exclude diamonds with a volume of 0. Adjust the transparency of the points and add a linear model to the plot. (See the Instructor Notes or look up the documentation of geom_smooth() for more details about smoothers.)
In [47]:
%%R
ggplot(aes(x=volume, y=price), data = subset(diamonds, (diamonds$volume!=0 & diamonds$volume<800)))+
geom_point(alpha=1/100) +
geom_smooth(method = 'lm', color = 'red') +
xlim(0, quantile(diamonds$volume, 0.99)) +
ylim(0, quantile(diamonds$price, 0.99))
In [48]:
#Let's see how that looks in our graph
g = sns.JointGrid(x= 'volume', y ='price', data=diamonds_sub, ratio=100,
xlim=(0,diamonds.volume.quantile(.99)),
ylim=(0,diamonds.price.quantile(.99)))
g.plot_joint(sns.regplot,
scatter_kws={'alpha':1/100},
line_kws={'color':'red'})
g.annotate(stats.pearsonr)
g.ax_marg_x.set_axis_off()
g.ax_marg_y.set_axis_off();
Use the function dplyr package to create a new data frame containing info on diamonds by clarity.
Name the data frame diamondsByClarity
The data frame should contain the following variables in this order.
(1) mean_price
(2) median_price
(3) min_price
(4) max_price
(5) n
where n is the number of diamonds in each level of clarity.
In [49]:
%%R
diamondsByClarity_grouped <- group_by(diamonds, clarity)
diamondsByClarity <- summarise(diamondsByClarity_grouped,
mean_price = mean(price),
median_price = median(price),
min_price = min(price),
max_price = max(price),
n = n())
head(diamondsByClarity)
In [50]:
#Pandas
diamondsByClarity_grouped = diamonds.groupby('clarity')
diamondsByClarity = diamondsByClarity_grouped.price.aggregate([np.mean, np.median, np.min, np.max, len])
diamondsByClarity.head()
Out[50]:
We’ve created summary data frames with the mean price by clarity and color. You can run the code in R to verify what data is in the variables diamonds_mp_by_clarity and diamonds_mp_by_color.
Your task is to write additional code to create two bar plots on one output image using the grid.arrange() function from the package gridExtra.
In [ ]:
%%R
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))
diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))
p1 <- ggplot(aes(x=color, y=mean_price), data = diamonds_mp_by_color) +
geom_bar(stat = 'identity')
p2 <- ggplot(aes(x=clarity, y=mean_price), data = diamonds_mp_by_clarity) +
geom_bar(stat = 'identity')
grid.arrange(p1,p2, ncol=2)
In [17]:
diamonds_by_clarity = diamonds.groupby('clarity')
diamonds_mp_by_clarity = diamonds_by_clarity.aggregate(np.mean)
diamonds_by_color = diamonds.groupby('color')
diamonds_mp_by_color = diamonds_by_color.aggregate(np.mean)
In [26]:
fig, axes = plt.subplots(1,2)
diamonds_mp_by_color.price.plot(kind='bar',width=0.8, ax=axes[0])
diamonds_mp_by_clarity.price.plot(kind='bar',width=0.85, ax=axes[1]);
Create a histogram of diamond prices. Facet the histogram by diamond color and use cut to color the histogram bars.
The plot should look something like this. http://i.imgur.com/b5xyrOu.jpg
Note: In the link, a color palette of type 'qual' was used to color the histogram using scale_fill_brewer(type = 'qual')
This assignment is not graded and will be marked as correct when you submit.
In [18]:
%%R
ggplot(aes(x=price), data=diamonds) +
facet_wrap(~color) +
geom_histogram(aes(color=cut)) +
scale_fill_brewer(type = 'qual')
In [41]:
#pandas-seaborn -setting the alpha was the trick here
g = sns.FacetGrid(diamonds, hue='cut',col='color', col_wrap=3)
g = g.map(plt.hist, 'price', bins=30, alpha=0.6).add_legend();
In [46]:
#pandas-ggplot
#I had to cast it as category to make it work
diamonds.cut = diamonds.cut.astype("category")
ggplot(aes(x='price', color='cut'), data=diamonds) +\
facet_wrap('color') + \
geom_histogram(bins=30, alpha=0.6) +\
scale_color_brewer(type='qual')
Out[46]:
In [38]:
In [ ]: