In [1]:
#### Introduction to Data Wrangling with Pandas ####
## Page 2 ##

In [2]:
#### Performing basic statistical operations ####

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
#so that we can view the graphs inside the notebook

In [4]:
# A small Series called "num" with one missing value (NaN) at position 2.
s = pd.Series(
    data=[1, 2, np.nan, 3],
    name="num",
)

In [5]:
# Display the Series: the integers are stored as floats (dtype float64) alongside the NaN.
s


Out[5]:
0    1.0
1    2.0
2    NaN
3    3.0
Name: num, dtype: float64

In [6]:
# Wrap the Series in a DataFrame; the Series name ("num") becomes the column label.
df1 = pd.DataFrame(s)

In [7]:
# Display the resulting one-column DataFrame.
df1


Out[7]:
num
0 1.0
1 2.0
2 NaN
3 3.0

In [11]:
# Example: if our age dataset is a year old, add 1 to every age at once:
# df['age_now'] = df['age'] + 1

In [8]:
# Vectorised arithmetic: every entry of 'num' is doubled and written to the same
# row of a new 'value' column (no explicit loop needed); a NaN input stays NaN.
df1['value'] = 2 * df1['num']
df1


Out[8]:
num value
0 1.0 2.0
1 2.0 4.0
2 NaN NaN
3 3.0 6.0

In [9]:
# len() counts every row, including the one holding NaN.
len(df1['num'])


Out[9]:
4

In [10]:
# count() reports only the non-missing entries, so the NaN row is left out.
df1['num'].count()


Out[10]:
3

In [11]:
# mean() skips NaN: (1 + 2 + 3) / 3 = 2.0.
df1['num'].mean()


Out[11]:
2.0

In [12]:
# sum() also ignores NaN: 1 + 2 + 3 = 6.0.
df1['num'].sum()


Out[12]:
6.0

In [13]:
# Median of the non-missing values 1, 2, 3.
df1['num'].median()


Out[13]:
2.0

In [14]:
# Standard deviation of the non-missing values. pandas defaults to the sample
# formula (ddof=1), which gives exactly 1.0 for 1, 2, 3.
df1['num'].std()


Out[14]:
1.0

In [13]:
# Smallest non-missing value.
df1['num'].min()


Out[13]:
1.0

In [14]:
# Largest non-missing value.
df1['num'].max()


Out[14]:
3.0

In [20]:
# describe() bundles count, mean, std, min, the quartiles and max into one summary.
df1['num'].describe()


Out[20]:
count    3.0
mean     2.0
std      1.0
min      1.0
25%      1.5
50%      2.0
75%      2.5
max      3.0
Name: num, dtype: float64

In [15]:
# Running total. The NaN row stays NaN in the result, but the total carries on
# past it (3.0 -> 6.0).
df1['num'].cumsum()


Out[15]:
0    1.0
1    3.0
2    NaN
3    6.0
Name: num, dtype: float64

In [21]:
# Called on the whole DataFrame, mean() returns one value per column.
df1.mean()


Out[21]:
num      2.0
value    4.0
dtype: float64

In [22]:
# When you call a method on the whole DataFrame, every numerical column is analysed.

In [23]:
# Pairwise correlation between the columns, returned as a matrix.
df1.corr()


Out[23]:
num value
num 1.0 1.0
value 1.0 1.0

In [24]:
# The correlation between num and value is 1 because value is exactly double num.

In [15]:
# Box plot of 'num'. The call returns the matplotlib Axes it drew on, which is
# why the cell output shows an AxesSubplot repr.
df1['num'].plot(kind='box')


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f25d067a0d0>

In [26]:
# The box spans the interquartile range: the red line is the median, the lower blue edge is the 25% quartile and the upper blue edge is the 75% quartile.

In [16]:
# Load the diamonds data set, using the CSV's 'Unnamed: 0' column as the row index.
# NOTE(review): the path points into the current user's home directory and is
# machine-specific; confirm where diamonds.csv lives before re-running.
df = pd.read_csv('~/diamonds-data/diamonds.csv', index_col='Unnamed: 0')

In [28]:
# Number of rows in the loaded data set.
len(df)


Out[28]:
598024

In [17]:
# Preview the first five rows; price is NaN in all of them.
df.head()


Out[17]:
carat cut color clarity table depth cert measurements price x y z
1 0.25 V.Good K I1 59.0 63.7 GIA 3.96 x 3.95 x 2.52 NaN 3.96 3.95 2.52
2 0.23 Good G I1 61.0 58.1 GIA 4.00 x 4.05 x 2.30 NaN 4.00 4.05 2.30
3 0.34 Good J I2 58.0 58.7 GIA 4.56 x 4.53 x 2.67 NaN 4.56 4.53 2.67
4 0.21 V.Good D I1 60.0 60.6 GIA 3.80 x 3.82 x 2.31 NaN 3.80 3.82 2.31
5 0.31 V.Good K I1 59.0 62.2 EGL 4.35 x 4.26 x 2.68 NaN 4.35 4.26 2.68

In [18]:
# fillna(0) returns a NEW DataFrame with every NaN replaced by 0; it does not
# modify df in place. The result is only previewed here and deliberately not
# assigned back to df: doing so would replace the full data set with these five
# zero-filled rows and break every later cell that relies on the complete frame.
df.fillna(0).head()


Out[18]:
carat cut color clarity table depth cert measurements price x y z
1 0.25 V.Good K I1 59.0 63.7 GIA 3.96 x 3.95 x 2.52 0.0 3.96 3.95 2.52
2 0.23 Good G I1 61.0 58.1 GIA 4.00 x 4.05 x 2.30 0.0 4.00 4.05 2.30
3 0.34 Good J I2 58.0 58.7 GIA 4.56 x 4.53 x 2.67 0.0 4.56 4.53 2.67
4 0.21 V.Good D I1 60.0 60.6 GIA 3.80 x 3.82 x 2.31 0.0 3.80 3.82 2.31
5 0.31 V.Good K I1 59.0 62.2 EGL 4.35 x 4.26 x 2.68 0.0 4.35 4.26 2.68

In [19]:
# Look at df again to check whether the fillna call above altered it.
df.head()


Out[19]:
carat cut color clarity table depth cert measurements price x y z
1 0.25 V.Good K I1 59.0 63.7 GIA 3.96 x 3.95 x 2.52 NaN 3.96 3.95 2.52
2 0.23 Good G I1 61.0 58.1 GIA 4.00 x 4.05 x 2.30 NaN 4.00 4.05 2.30
3 0.34 Good J I2 58.0 58.7 GIA 4.56 x 4.53 x 2.67 NaN 4.56 4.53 2.67
4 0.21 V.Good D I1 60.0 60.6 GIA 3.80 x 3.82 x 2.31 NaN 3.80 3.82 2.31
5 0.31 V.Good K I1 59.0 62.2 EGL 4.35 x 4.26 x 2.68 NaN 4.35 4.26 2.68

In [20]:
# List the column labels.
df.columns


Out[20]:
Index([u'carat', u'cut', u'color', u'clarity', u'table', u'depth', u'cert',
       u'measurements', u'price', u'x', u'y', u'z'],
      dtype='object')

In [21]:
# Distinct values found in the 'color' column, in order of first appearance.
df['color'].unique()


Out[21]:
array(['K', 'G', 'J', 'D', 'F', 'E', 'H', 'I', 'L'], dtype=object)

In [22]:
# Summary statistics; only the numeric columns are included. The count row is
# lower for price, x, y and z than for the other columns, i.e. those four
# columns contain missing values.
df.describe()


Out[22]:
carat table depth price x y z
count 598024.000000 598024.000000 598024.000000 597311.000000 596209.000000 596172.000000 595480.000000
mean 1.071297 57.631077 61.063683 8753.017974 5.990771 6.198671 4.033430
std 0.812696 4.996892 7.604342 13017.567760 1.530936 1.485891 1.240951
min 0.200000 0.000000 0.000000 300.000000 0.150000 1.000000 0.040000
25% 0.500000 56.000000 61.000000 1220.000000 4.740000 4.970000 3.120000
50% 0.900000 58.000000 62.100000 3503.000000 5.780000 6.050000 3.860000
75% 1.500000 59.000000 62.700000 11174.000000 6.970000 7.230000 4.610000
max 9.250000 75.900000 81.300000 99990.000000 13.890000 13.890000 13.180000

In [23]:
# Box plot of the 'x' column.
df['x'].plot(kind='box')


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f25cff93450>

In [1]:
# Learn more about box plots
# http://www.physics.csbsju.edu/stats/box2.html