In [4]:

    
# %load /home/jkb/Data-Science/setup/eda_jupyter_setup.py
'''
EDA setup for Jupyter
'''

import rpy2.interactive 
import rpy2.interactive.packages
%load_ext rpy2.ipython

# Directly convert objects from pandas to r and vsv
from rpy2.robjects import r, pandas2ri
pandas2ri.activate()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.style.use('seaborn-ticks')

#Set plottng parameters
plt.rcParams['savefig.dpi'] = 200

plt.rcParams['figure.autolayout'] = False
plt.rcParams['figure.figsize'] = 18, 10
plt.rcParams['axes.labelsize'] = 18
plt.rcParams['axes.titlesize'] = 20
plt.rcParams['font.size'] = 16
plt.rcParams['lines.linewidth'] = 2.0
plt.rcParams['lines.markersize'] = 8
plt.rcParams['legend.fontsize'] = 14

plt.rcParams['text.usetex'] = False # True activates latex output in fonts!
plt.rcParams['font.family'] = "serif"
plt.rcParams['font.serif'] = "cm"
# plt.rcParams['text.latex.preamble'] = "\usepackage{subdepth}, \usepackage{type1cm}"

#clear warning
import warnings
warnings.filterwarnings('ignore')


#import plotly 
import plotly.tools as tls
import cufflinks as cf
import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

#import ggplot
from ggplot import *

# Load R libraries
%R library(ggplot2)
%R library(dplyr)
%R library(gridExtra)
%R library(plotly)
%R library(GGally)









    











    











    Out[4]:





array(['GGally', 'plotly', 'gridExtra', 'dplyr', 'ggplot2', 'compiler',
       'tools', 'stats', 'graphics', 'grDevices', 'utils', 'datasets',
       'methods', 'base'],
      dtype='<U9')

Explore two variables: Problem Set

In this problem set, you'll continue to explore the diamonds data set.

price vs. x



In [5]:

    
%%R
# Your first task is to create a
# scatterplot of price vs x.
# using the ggplot syntax.
data(diamonds)
str(diamonds)









    





Classes ‘tbl_df’, ‘tbl’ and 'data.frame':	53940 obs. of  10 variables:
 $ carat  : num  0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
 $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
 $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
 $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
 $ depth  : num  61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
 $ table  : num  55 61 65 58 58 57 57 55 61 61 ...
 $ price  : int  326 326 327 334 335 336 336 337 337 338 ...
 $ x      : num  3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
 $ y      : num  3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
 $ z      : num  2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...



In [6]:

    
%%R
ggplot(aes(y=price, x=x), data = diamonds) + 
  geom_point()



In [13]:

    
diamonds = r.diamonds
diamonds.head()









    Out[13]:





R object with classes: ('tbl_df', 'tbl', 'data.frame') mapped to:
<DataFrame - Python:0x7f3798156948 / R:0x8049a78>
[Float..., Facto..., Facto..., ..., Float..., Float..., Float...]
  carat: <class 'rpy2.robjects.vectors.FloatVector'>
  R object with classes: ('numeric',) mapped to:
<FloatVector - Python:0x7f379acf0f08 / R:0xb15e078>
[0.230000, 0.210000, 0.230000, 0.290000, 0.310000, 0.240000]
  cut: <class 'rpy2.robjects.vectors.FactorVector'>
  R object with classes: ('ordered', 'factor') mapped to:
<FactorVector - Python:0x7f379814a288 / R:0x804c6a0>
[       4,        3,        1,        3,        1,        2]
  color: <class 'rpy2.robjects.vectors.FactorVector'>
  R object with classes: ('ordered', 'factor') mapped to:
<FactorVector - Python:0x7f379814a048 / R:0x804dcc0>
[       1,        1,        1,        2,        3,        3]
  ...
  carat: <class 'rpy2.robjects.vectors.FloatVector'>
  R object with classes: ('numeric',) mapped to:
<FloatVector - Python:0x7f379814a0c8 / R:0xb15df40>
[3.950000, 3.890000, 4.050000, 4.200000, 4.340000, 3.940000]
  cut: <class 'rpy2.robjects.vectors.FloatVector'>
  R object with classes: ('numeric',) mapped to:
<FloatVector - Python:0x7f379ad13ec8 / R:0xb15ded8>
[3.980000, 3.840000, 4.070000, 4.230000, 4.350000, 3.960000]
  color: <class 'rpy2.robjects.vectors.FloatVector'>
  R object with classes: ('numeric',) mapped to:
<FloatVector - Python:0x7f379ad13748 / R:0xb15de70>
[2.430000, 2.310000, 2.310000, 2.630000, 2.750000, 2.480000]



In [ ]:

    
# pass it with the -o switch directly from R



In [12]:

    
%R -o diamonds



In [11]:

    
#it is not a full pandas dataframe
diamonds.head()









    Out[11]:





    R/rpy2 DataFrame (6 x 10)
    
      
        
        
          carat
        
          cut
        
          color
        
          ...
        
          x
        
          y
        
          z
        
        
      
      
      
      
      
      
        0.230000
      
      
      
        Ideal
      
      
      
        E
      
      
      
        ...
      
      
      
        3.950000
      
      
      
        3.980000
      
      
      
        2.430000
      
      
      
      
      
      
      
        0.210000
      
      
      
        Premium
      
      
      
        E
      
      
      
        ...
      
      
      
        3.890000
      
      
      
        3.840000
      
      
      
        2.310000
      
      
      
      
      
      
      
        0.230000
      
      
      
        Good
      
      
      
        E
      
      
      
        ...
      
      
      
        4.050000
      
      
      
        4.070000
      
      
      
        2.310000
      
      
      
      
      
      
      
        0.290000
      
      
      
        Premium
      
      
      
        I
      
      
      
        ...
      
      
      
        4.200000
      
      
      
        4.230000
      
      
      
        2.630000
      
      
      
      
      
      
      
        0.310000
      
      
      
        Good
      
      
      
        J
      
      
      
        ...
      
      
      
        4.340000
      
      
      
        4.350000
      
      
      
        2.750000
      
      
      
      
      
      
      
        0.240000
      
      
      
        Very ...
      
      
      
        J
      
      
      
        ...
      
      
      
        3.940000
      
      
      
        3.960000
      
      
      
        2.480000



In [9]:

    
diamonds.plot.scatter(y= 'price', x ='x');









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-4491bab912b2> in <module>()
----> 1 diamonds.plot.scatter(y= 'price', x ='x');

AttributeError: 'DataFrame' object has no attribute 'plot'



In [14]:

    
help(diamonds)









    



Help on DataFrame in module rpy2.robjects.vectors object:

class DataFrame(ListVector)
 |  R 'data.frame'.
 |  
 |  Method resolution order:
 |      DataFrame
 |      ListVector
 |      Vector
 |      rpy2.robjects.robject.RObjectMixin
 |      rpy2.rinterface.ListSexpVector
 |      rpy2.rinterface.SexpVector
 |      rpy2.rinterface.Sexp
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __getitem__(self, i)
 |      Return self[key].
 |  
 |  __init__(self, obj, stringsasfactor=False)
 |      Create a new data frame.
 |      
 |      :param obj: object inheriting from rpy2.rinterface.SexpVector,
 |                  or inheriting from TaggedList
 |                  or a mapping name -> value
 |      :param stringsasfactors: Boolean indicating whether vectors
 |                  of strings should be turned to vectors. Note
 |                  that factors will not be turned to string vectors.
 |  
 |  cbind(self, *args, **kwargs)
 |      bind objects as supplementary columns
 |  
 |  head(self, *args, **kwargs)
 |      Call the R generic 'head()'.
 |  
 |  iter_column(self)
 |      iterator across columns
 |  
 |  iter_row(self)
 |      iterator across rows
 |  
 |  rbind(self, *args, **kwargs)
 |      bind objects as supplementary rows
 |  
 |  to_csvfile(self, path, quote=True, sep=',', eol='\n', na='NA', dec='.', row_names=True, col_names=True, qmethod='escape', append=False)
 |      Save the data into a .csv file. 
 |      
 |      path         : string with a path 
 |      quote        : quote character
 |      sep          : separator character
 |      eol          : end-of-line character(s)
 |      na           : string for missing values
 |      dec          : string for decimal separator
 |      row_names    : boolean (save row names, or not)
 |      col_names    : boolean (save column names, or not)
 |      comment_char : method to 'escape' special characters
 |      append       : boolean (append if the file in the path is already existing, or not)
 |  
 |  ----------------------------------------------------------------------
 |  Static methods defined here:
 |  
 |  from_csvfile(path, header=True, sep=',', quote='"', dec='.', row_names=rpy2.rinterface.MissingArg, col_names=rpy2.rinterface.MissingArg, fill=True, comment_char='', na_strings=[], as_is=False)
 |      Create an instance from data in a .csv file. 
 |      
 |      path         : string with a path 
 |      header       : boolean (heading line with column names or not)
 |      sep          : separator character
 |      quote        : quote character
 |      row_names    : column name, or column index for column names (warning: indexing starts at one in R)
 |      fill         : boolean (fill the lines when less entries than columns)
 |      comment_char : comment character
 |      na_strings   : a list of strings which are interpreted to be NA values
 |      as_is        : boolean (keep the columns of strings as such, or turn them into factors)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  colnames
 |  
 |  ncol
 |      Number of columns.
 |      :rtype: integer
 |  
 |  nrow
 |      Number of rows. 
 |      :rtype: integer
 |  
 |  rownames
 |      Row names
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from ListVector:
 |  
 |  __repr__(self)
 |      Return repr(self).
 |  
 |  ----------------------------------------------------------------------
 |  Static methods inherited from ListVector:
 |  
 |  from_length(length)
 |      Create a list of given length
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from Vector:
 |  
 |  __add__(self, x)
 |  
 |  __getslice__(self, i, j)
 |  
 |  __repr_content__(self)
 |  
 |  __setitem__(self, i, value)
 |      Set self[key] to value.
 |  
 |  items(self)
 |      iterator on names and values
 |  
 |  repr_format_elt(self, elt, max_width=12)
 |  
 |  sample(self, n, replace=False, probabilities=None)
 |      Draw a sample of size n from the vector. 
 |      If 'replace' is True, the sampling is done with replacement.
 |      The optional argument 'probabilities' can indicate sampling probabilities.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from Vector:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)
 |  
 |  names
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from rpy2.robjects.robject.RObjectMixin:
 |  
 |  __reduce__(self)
 |      robjects-level `__reduce__()`, calling the parent class'
 |      `__reduce__()` before substituting the current high-level
 |      class as a constructor.
 |  
 |  __str__(self)
 |      Return str(self).
 |  
 |  r_repr(self)
 |      String representation for an object that can be
 |      directly evaluated as R code.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from rpy2.robjects.robject.RObjectMixin:
 |  
 |  rclass
 |      R class for the object, stored as an R string vector.
 |      
 |      When setting the rclass, the new value will be:
 |      - wrapped in a Python tuple if a string (the R class
 |        is a vector of strings, and this is made for convenience)
 |      - wrapped in a StrSexpVector
 |      Note that when setting the class R may make a copy of
 |      the whole object (R is mostly a functional language).
 |      If this must be avoided, and if the number of parent
 |      classes before and after the change are compatible,
 |      the class name can be changed in-place by replacing "
 |      vector elements.
 |  
 |  slots
 |      Attributes of the underlying R object as a Python mapping.
 |      
 |      The attributes can accessed and assigned by name (as if they
 |      were in a Python `dict`).
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes inherited from rpy2.robjects.robject.RObjectMixin:
 |  
 |  __rname__ = None
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from rpy2.rinterface.SexpVector:
 |  
 |  __delitem__(self, key, /)
 |      Delete self[key].
 |  
 |  __len__(self, /)
 |      Return len(self).
 |  
 |  index(...)
 |      V.index(value, [start, [stop]]) -> integer -- return first index of value.Raises ValueError if the value is not present.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from rpy2.rinterface.SexpVector:
 |  
 |  __array_struct__
 |      Array protocol: struct
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from rpy2.rinterface.Sexp:
 |  
 |  __deepcopy__(...)
 |      Makes a copy of the underlying Sexp object, and returns it.
 |  
 |  __getstate__(...)
 |      Returns a serialized object for the underlying R object
 |  
 |  __new__(*args, **kwargs) from builtins.type
 |      Create and return a new object.  See help(type) for accurate signature.
 |  
 |  __setstate__(...)
 |      set the state of an instance (dummy).
 |  
 |  do_slot(...)
 |      Returns the attribute/slot for an R object.
 |       The name of the slot (a string) is the only parameter for
 |      the method.
 |      :param name: string
 |      :rtype: instance of type or subtype :class:`rpy2.rinterface.Sexp`
 |  
 |  do_slot_assign(...)
 |      Set the attribute/slot for an R object.
 |      
 |      :param name: string
 |      :param value: instance of :class:`rpy2.rinterface.Sexp`
 |  
 |  list_attrs(...)
 |      Returns the list of attribute names.
 |  
 |  rsame(...)
 |      Is the given object representing the same underlying R object as the instance.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from rpy2.rinterface.Sexp:
 |  
 |  __sexp__
 |      Opaque C pointer to the underlying R object
 |  
 |  __sexp_refcount__
 |      Reference counter for the underlying R object
 |  
 |  named
 |      Integer code for the R object reference-pseudo counting.
 |      This method corresponds to the macro NAMED.
 |      See the R-extensions manual for further details.
 |  
 |  rid
 |      ID for the associated R object (Hint: that's a memory address)
 |  
 |  typeof
 |      R internal SEXPREC type.



In [15]:

    
# Will have to convert the R object explicitily to a dataframe
diamonds = pandas2ri.ri2py_dataframe(r.diamonds)
diamonds.head()



In [16]:

    
diamonds.plot.scatter(y= 'price', x ='x');

Observations

There seems to exist a strong positive correlation between rice and the x variable
The number of observations seem to be positively correlated as well for both variables.

From the Answer

Outliers
Exponential relationship!

Correlations

Which are the correlations between price and x, y and z

Checking to see how I can capture an use R output



In [8]:

    
%%capture a
%%R 
with(diamonds, cor.test(price, x))



In [9]:

    
a.outputs[0]









    Out[9]:





<IPython.utils.capture.RichOutput at 0x7f90a0f46b70>



In [10]:

    
str(a)









    Out[10]:





''



In [11]:

    
a.stdout









    Out[11]:





''



In [12]:

    
a.show()









    





	Pearson's product-moment correlation

data:  price and x
t = 440.16, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.8825835 0.8862594
sample estimates:
      cor 
0.8844352



In [13]:

    
b = a.show()









    





	Pearson's product-moment correlation

data:  price and x
t = 440.16, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.8825835 0.8862594
sample estimates:
      cor 
0.8844352



In [14]:

    
type(b)









    Out[14]:





NoneType



In [15]:

    
%%capture b
a.show()



In [16]:

    
b.outputs









    Out[16]:





[<IPython.utils.capture.RichOutput at 0x7f90a0f461d0>]

I can not get it with capture. Maybe if I rpy interface



In [17]:

    
a









    Out[17]:





<IPython.utils.capture.CapturedIO at 0x7f90a0f46898>



In [18]:

    
%R a= with(diamonds, cor.test(price, y)) -o a









    Out[18]:





R object with classes: ('htest',) mapped to:
<ListVector - Python:0x7f9065456b08 / R:0x8e97a80>
[Float..., IntVe..., Float..., ..., StrVe..., StrVe..., Float...]
  statistic: <class 'rpy2.robjects.vectors.FloatVector'>
  R object with classes: ('numeric',) mapped to:
<FloatVector - Python:0x7f9065969b88 / R:0xae906b8>
[401.141474]
  parameter: <class 'rpy2.robjects.vectors.IntVector'>
  R object with classes: ('integer',) mapped to:
<IntVector - Python:0x7f906591a8c8 / R:0xae90628>
[   53938]
  p.value: <class 'rpy2.robjects.vectors.FloatVector'>
  R object with classes: ('numeric',) mapped to:
<FloatVector - Python:0x7f906591ad08 / R:0xae89528>
[0.000000]
  ...
  statistic: <class 'rpy2.robjects.vectors.StrVector'>
  R object with classes: ('character',) mapped to:
<StrVector - Python:0x7f906591ae48 / R:0x506aab8>
["Pearson's product-moment correlation"]
  parameter: <class 'rpy2.robjects.vectors.StrVector'>
  R object with classes: ('character',) mapped to:
<StrVector - Python:0x7f906591ad88 / R:0xae950a8>
['price and y']
  p.value: <class 'rpy2.robjects.vectors.FloatVector'>
  R object with classes: ('numeric',) mapped to:
<FloatVector - Python:0x7f9065918248 / R:0x9bc25f0>
[0.863287, 0.867524]



In [19]:

    
a.index









    Out[19]:





<function ListVector.index>



In [20]:

    
%%R
with(diamonds, cor.test(price, z))









    





	Pearson's product-moment correlation

data:  price and z
t = 393.6, df = 53938, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.8590541 0.8634131
sample estimates:
      cor 
0.8612494



In [21]:

    
# Ideal for a nice little python script
for variable in('x', 'y', 'z'):
    print('Correlation of Price with  {}: {:.3f}'.format(variable, diamonds.price.corr(diamonds[variable])))









    



Correlation of Price with  x: 0.884
Correlation of Price with  y: 0.865
Correlation of Price with  z: 0.861

price vs. depth



In [17]:

    
%%R
ggplot(aes(y=price, x=depth), data = diamonds) + 
  geom_point()



In [23]:

    
diamonds.plot.scatter(y= 'price', x ='depth');

Adjustments - price vs. depth

Change the code to make the transparency of the points to be 1/100 of what they are now and mark the x-axis every 2 units. See the instructor notes for two hints.



In [24]:

    
%%R
ggplot(aes(y=price, x=depth), data = diamonds) + 
  geom_point(alpha=1/100) +
  scale_x_continuous(breaks = seq(43,79,2))



In [25]:

    
diamonds.plot.scatter(y= 'price', x ='depth', alpha=1/100)
# plt.xlim(43,79)
x=diamonds.depth
plt.xticks(np.arange(min(x), max(x)+1, 2.0));

Typical Depth Range

Based on the scatterplot most diamonds are within what range?

Between 60 and 63



In [26]:

    
%%R
summary(diamonds$depth)









    





   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  43.00   61.00   61.80   61.75   62.50   79.00



In [27]:

    
diamonds.depth.describe()









    Out[27]:





count    53940.000000
mean        61.749405
std          1.432621
min         43.000000
25%         61.000000
50%         61.800000
75%         62.500000
max         79.000000
Name: depth, dtype: float64



In [28]:

    
sns.jointplot(y= 'price', x ='depth', data=diamonds, alpha=1/100);



In [29]:

    
sns.jointplot(x= 'price', y ='depth', data=diamonds, kind='reg');



In [30]:

    
sns.jointplot(x= 'price', y ='depth', data=diamonds, kind='kde');



In [31]:

    
#Let's take a closer look
sns.jointplot(x= 'price', y ='depth', data=diamonds, kind='kde', xlim=(0,2000), ylim=(60,65));



In [32]:

    
#Let's take a closer look
sns.jointplot(x= 'price', y ='depth', data=diamonds, kind='scatter', xlim=(0,2000), ylim=(60,65), alpha=1/50);

price vs carat

Create a scatterplot of price vs carat and omit the top 1% of price and carat values.



In [33]:

    
%%R
#First the plain one
ggplot(aes(x=carat, y=price), data = diamonds) + 
  geom_point(alpha=1/100)  
    #xlim(0, quantile(diamonds$carat, 0.90)) + 
    #ylim(0, quantile(diamonds$carat, 0.90))



In [34]:

    
%%R
ggplot(aes(x=carat, y=price), data = diamonds) + 
  geom_point(alpha=1/100)  +
    xlim(0, quantile(diamonds$carat, 0.99)) + 
    ylim(0, quantile(diamonds$price, 0.99))



In [35]:

    
sns.jointplot(x= 'carat', y ='price', data=diamonds, kind='scatter',
              xlim=(0,diamonds.carat.quantile(.99)),
              ylim=(0,diamonds.price.quantile(.99)),
              alpha=1/100, size=7);

This is cool but I will try now to recreate the scatter plot without the marginal histograms and with a regression line



In [36]:

    
from scipy import stats



In [37]:

    
stats.pearsonr(diamonds.price, diamonds.carat)









    Out[37]:





(0.92159130119347676, 0.0)



In [38]:

    
# This works butt dos not accept the pearson annotation
fig, ax = plt.subplots(1, 1, figsize=(10, 10))

g  = sns.regplot(x= 'carat', y ='price', data=diamonds, ax=ax,
                 scatter_kws={'alpha':1/100},
                 line_kws={'color':'red'})
g.set(xlim=(0,diamonds.carat.quantile(.99)),
      ylim=(0,diamonds.price.quantile(.99)));



In [39]:

    
g = sns.JointGrid(x= 'carat', y ='price', data=diamonds, ratio=100,
                 xlim=(0,diamonds.carat.quantile(.99)),
                 ylim=(0,diamonds.price.quantile(.99)))
                 
g.plot_joint(sns.regplot,
             scatter_kws={'alpha':1/100},
             fit_reg=False)
g.annotate(stats.pearsonr);
# g.ax_marg_x.set_axis_off()
# g.ax_marg_y.set_axis_off()

price vs volume

Create a scatterplot of price vs. volume (x y z). This is a very rough approximation for a diamond's volume.

Create a new variable for volume in the diamonds data frame. This will be useful in a later exercise.



In [40]:

    
%%R
#create new variable
diamonds$volume <- diamonds$x * diamonds$y * diamonds$z

ggplot(aes(x=volume, y=price), data = diamonds) + 
  geom_point(alpha=1/100)  +
    xlim(0, quantile(diamonds$volume, 0.99)) + 
    ylim(0, quantile(diamonds$price, 0.99))



In [41]:

    
#let's get the update df from the r object ;)
diamonds = pandas2ri.ri2py_dataframe(r.diamonds)

g = sns.JointGrid(x= 'volume', y ='price', data=diamonds, ratio=100,
                 xlim=(0,diamonds.volume.quantile(.99)),
                 ylim=(0,diamonds.price.quantile(.99)))
                 
g.plot_joint(sns.regplot,
             scatter_kws={'alpha':1/100},
             line_kws={'color':'red'})
g.annotate(stats.pearsonr);

Observations

Strong positive correlation
Most diamonds have a price below 8000 and volume below 200

Outliers



In [42]:

    
%%R
ggplot(aes(x=volume, y=price), data = diamonds) +
  geom_point()



In [43]:

    
diamonds.plot.scatter(x ='volume', y= 'price');



In [44]:

    
# Number of outliers with zero volume
diamonds[diamonds.volume==0].index.value_counts().sum()









    Out[44]:





20

Correlation price vs volume

Exclude diamonds with a volume o or greater than 800



In [45]:

    
%%R
with(subset(diamonds, (diamonds$volume!=0 & diamonds$volume<800)), cor.test(volume, price))









    





	Pearson's product-moment correlation

data:  volume and price
t = 559.19, df = 53915, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.9222944 0.9247772
sample estimates:
      cor 
0.9235455



In [46]:

    
diamonds_sub = diamonds[(diamonds.volume!=0)&( diamonds.volume<800)]
diamonds_sub.volume.corr(diamonds_sub.price)









    Out[46]:





0.92354550185956141

!!You need to set the limitations beforehand if you want to get the right pearson score

Adjustments

Subset the data to exclude diamonds with a volume greater than or equal to 800. Also, exclude diamonds with a volume of 0. Adjust the transparency of the points and add a linear model to the plot. (See the Instructor Notes or look up the documentation of geom_smooth() for more details about smoothers.)



In [47]:

    
%%R
ggplot(aes(x=volume, y=price), data = subset(diamonds, (diamonds$volume!=0 & diamonds$volume<800)))+
         geom_point(alpha=1/100)  +
  geom_smooth(method = 'lm', color = 'red') + 
    xlim(0, quantile(diamonds$volume, 0.99)) + 
    ylim(0, quantile(diamonds$price, 0.99))



In [48]:

    
#Let's see how that looks in our graph
g = sns.JointGrid(x= 'volume', y ='price', data=diamonds_sub, ratio=100,
                  xlim=(0,diamonds.volume.quantile(.99)),
                  ylim=(0,diamonds.price.quantile(.99)))
                 
g.plot_joint(sns.regplot,
             scatter_kws={'alpha':1/100},
             line_kws={'color':'red'})
g.annotate(stats.pearsonr)
g.ax_marg_x.set_axis_off()
g.ax_marg_y.set_axis_off();

Mean price by clarity

Use the function dplyr package to create a new data frame containing info on diamonds by clarity.

Name the data frame diamondsByClarity

The data frame should contain the following variables in this order.

    (1) mean_price
    (2) median_price
    (3) min_price
    (4) max_price
    (5) n

where n is the number of diamonds in each level of clarity.



In [49]:

    
%%R
diamondsByClarity_grouped <- group_by(diamonds, clarity)

diamondsByClarity <- summarise(diamondsByClarity_grouped,
                               mean_price = mean(price),
                               median_price = median(price),
                               min_price = min(price),
                               max_price = max(price),
                               n = n())

head(diamondsByClarity)









    





# A tibble: 6 × 6
  clarity mean_price median_price min_price max_price     n
    <ord>      <dbl>        <dbl>     <int>     <int> <int>
1      I1   3924.169         3344       345     18531   741
2     SI2   5063.029         4072       326     18804  9194
3     SI1   3996.001         2822       326     18818 13065
4     VS2   3924.989         2054       334     18823 12258
5     VS1   3839.455         2005       327     18795  8171
6    VVS2   3283.737         1311       336     18768  5066



In [50]:

    
#Pandas
diamondsByClarity_grouped = diamonds.groupby('clarity')

diamondsByClarity = diamondsByClarity_grouped.price.aggregate([np.mean, np.median, np.min, np.max, len])

diamondsByClarity.head()









    Out[50]:






  
    
      
      mean
      median
      amin
      amax
      len
    
    
      clarity
      
      
      
      
      
    
  
  
    
      1
      3924.168691
      3344
      345
      18531
      741
    
    
      2
      5063.028606
      4072
      326
      18804
      9194
    
    
      3
      3996.001148
      2822
      326
      18818
      13065
    
    
      4
      3924.989395
      2054
      334
      18823
      12258
    
    
      5
      3839.455391
      2005
      327
      18795
      8171

Bar Charts of Mean Price

We’ve created summary data frames with the mean price by clarity and color. You can run the code in R to verify what data is in the variables diamonds_mp_by_clarity and diamonds_mp_by_color.

Your task is to write additional code to create two bar plots on one output image using the grid.arrange() function from the package gridExtra.



In [ ]:

    
%%R
diamonds_by_clarity <- group_by(diamonds, clarity)
diamonds_mp_by_clarity <- summarise(diamonds_by_clarity, mean_price = mean(price))

diamonds_by_color <- group_by(diamonds, color)
diamonds_mp_by_color <- summarise(diamonds_by_color, mean_price = mean(price))



p1 <- ggplot(aes(x=color, y=mean_price), data = diamonds_mp_by_color) + 
      geom_bar(stat = 'identity')
p2 <- ggplot(aes(x=clarity, y=mean_price), data = diamonds_mp_by_clarity) + 
  geom_bar(stat = 'identity')
grid.arrange(p1,p2, ncol=2)



In [17]:

    
diamonds_by_clarity = diamonds.groupby('clarity')
diamonds_mp_by_clarity = diamonds_by_clarity.aggregate(np.mean)

diamonds_by_color = diamonds.groupby('color')
diamonds_mp_by_color = diamonds_by_color.aggregate(np.mean)



In [26]:

    
fig, axes = plt.subplots(1,2)

diamonds_mp_by_color.price.plot(kind='bar',width=0.8, ax=axes[0])
diamonds_mp_by_clarity.price.plot(kind='bar',width=0.85, ax=axes[1]);

What do you notice in each of the bar charts?

They have an opposite trend - mean_price rises with the color and it decreases with clarity
Clarity of 2 breaks the trend and has very high price

Explore Two Variables: Problem Set

Create a histogram of diamond prices. Facet the histogram by diamond color and use cut to color the histogram bars.

The plot should look something like this. http://i.imgur.com/b5xyrOu.jpg

Note: In the link, a color palette of type 'qual' was used to color the histogram using scale_fill_brewer(type = 'qual')

This assignment is not graded and will be marked as correct when you submit.



In [18]:

    
%%R
ggplot(aes(x=price), data=diamonds) + 
  facet_wrap(~color) +
  geom_histogram(aes(color=cut)) +
  scale_fill_brewer(type = 'qual')



In [41]:

    
#pandas-seaborn -setting the alpha was the trick here
g = sns.FacetGrid(diamonds, hue='cut',col='color', col_wrap=3)
g = g.map(plt.hist, 'price', bins=30, alpha=0.6).add_legend();



In [46]:

    
#pandas-ggplot

#I had to cast it as category to make it work
diamonds.cut = diamonds.cut.astype("category")

ggplot(aes(x='price', color='cut'), data=diamonds) +\
        facet_wrap('color') + \
        geom_histogram(bins=30, alpha=0.6) +\
        scale_color_brewer(type='qual')









    












    Out[46]:





<ggplot: (8754166600740)>



In [38]:



In [ ]:

carat	cut	color	...	x	y	z
0.230000	Ideal	E	...	3.950000	3.980000	2.430000
0.210000	Premium	E	...	3.890000	3.840000	2.310000
0.230000	Good	E	...	4.050000	4.070000	2.310000
0.290000	Premium	I	...	4.200000	4.230000	2.630000
0.310000	Good	J	...	4.340000	4.350000	2.750000
0.240000	Very ...	J	...	3.940000	3.960000	2.480000

	carat	cut	color	clarity	depth	table	price	x	y	z
1	0.23	5	2	2	61.5	55.0	326	3.95	3.98	2.43
2	0.21	4	2	3	59.8	61.0	326	3.89	3.84	2.31
3	0.23	2	2	5	56.9	65.0	327	4.05	4.07	2.31
4	0.29	4	6	4	62.4	58.0	334	4.20	4.23	2.63
5	0.31	2	7	2	63.3	58.0	335	4.34	4.35	2.75

	mean	median	amin	amax	len
clarity
1	3924.168691	3344	345	18531	741
2	5063.028606	4072	326	18804	9194
3	3996.001148	2822	326	18818	13065
4	3924.989395	2054	334	18823	12258
5	3839.455391	2005	327	18795	8171

Explore two variables: Problem Set

price vs. x

Observations

From the Answer

Correlations

Checking to see how I can capture an use R output

price vs. depth

Adjustments - price vs. depth

Typical Depth Range

price vs carat

price vs volume

Observations

Outliers

Correlation price vs volume

Adjustments

Mean price by clarity

Bar Charts of Mean Price

What do you notice in each of the bar charts?

Explore Two Variables: Problem Set

Price Histograms with Facet and Color