notebook.community

Edit and run



In [1]:

    
#### Introduction to Data Wrangling with Pandas ####
## Page 2 ##



In [2]:

    
#### Performing basic statistical operations ####



In [3]:

    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 
#so that we can view the graphs inside the notebook



In [4]:

    
s = pd.Series([1, 2, np.nan, 3], name="num")



In [5]:

    
s









    Out[5]:





0    1.0
1    2.0
2    NaN
3    3.0
Name: num, dtype: float64



In [6]:

    
df1 = pd.DataFrame(s)



In [7]:

    
df1



In [11]:

    
# If our age dataset is a year old:
# df['age_now'] = df['age'] + 1



In [8]:

    
df1['value'] = df1['num']*2
# Element-wise arithmetic: every entry of column num is multiplied by 2 and the result is
# stored in the matching row of the new value column
df1



In [9]:

    
# len() counts every row of the column, including the NaN entry
len(df1['num'])









    Out[9]:





4



In [10]:

    
# count() tallies only the non-NaN entries, so it is one less than len(df1['num']) here
df1['num'].count()









    Out[10]:





3



In [11]:

    
# mean() skips NaN by default: (1 + 2 + 3) / 3, not a division by 4
df1['num'].mean()









    Out[11]:





2.0



In [12]:

    
df1['num'].sum()









    Out[12]:





6.0



In [13]:

    
df1['num'].median()









    Out[13]:





2.0



In [14]:

    
# std() ignores NaN and by default returns the sample standard deviation (ddof=1)
df1['num'].std()









    Out[14]:





1.0



In [13]:

    
df1['num'].min()









    Out[13]:





1.0



In [14]:

    
df1['num'].max()









    Out[14]:





3.0



In [20]:

    
# describe() bundles count, mean, std, min, the quartiles and max into one summary;
# the NaN entry is left out of all of them
df1['num'].describe()









    Out[20]:





count    3.0
mean     2.0
std      1.0
min      1.0
25%      1.5
50%      2.0
75%      2.5
max      3.0
Name: num, dtype: float64



In [15]:

    
# cumsum() skips the NaN while accumulating but keeps NaN at that position in the result
df1['num'].cumsum()









    Out[15]:





0    1.0
1    3.0
2    NaN
3    6.0
Name: num, dtype: float64



In [21]:

    
df1.mean()









    Out[21]:





num      2.0
value    4.0
dtype: float64



In [22]:

    
# When you pass the whole DataFrame, all numerical columns are analysed



In [23]:

    
df1.corr()



In [24]:

    
# The correlation between num and value is 1 because value is exactly double num



In [15]:

    
df1['num'].plot(kind='box')









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f25d067a0d0>



In [26]:

    
# The box is the interquartile range: the red line is the median, the lower blue line is the 25% quartile and the upper blue line is the 75% quartile



In [16]:

    
# Read the diamonds CSV and use the column labelled 'Unnamed: 0' as the row index
# (presumably a header-less index column in the file, which pandas names that way — verify against the CSV)
# NOTE(review): the path points into the author's home directory — adjust it to where your copy lives
df = pd.read_csv('~/diamonds-data/diamonds.csv', index_col='Unnamed: 0')



In [28]:

    
len(df)









    Out[28]:





598024



In [17]:

    
df.head()









    Out[17]:






  
    
      
      carat
      cut
      color
      clarity
      table
      depth
      cert
      measurements
      price
      x
      y
      z
    
  
  
    
      1
      0.25
      V.Good
      K
      I1
      59.0
      63.7
      GIA
      3.96 x 3.95 x 2.52
      NaN
      3.96
      3.95
      2.52
    
    
      2
      0.23
      Good
      G
      I1
      61.0
      58.1
      GIA
      4.00 x 4.05 x 2.30
      NaN
      4.00
      4.05
      2.30
    
    
      3
      0.34
      Good
      J
      I2
      58.0
      58.7
      GIA
      4.56 x 4.53 x 2.67
      NaN
      4.56
      4.53
      2.67
    
    
      4
      0.21
      V.Good
      D
      I1
      60.0
      60.6
      GIA
      3.80 x 3.82 x 2.31
      NaN
      3.80
      3.82
      2.31
    
    
      5
      0.31
      V.Good
      K
      I1
      59.0
      62.2
      EGL
      4.35 x 4.26 x 2.68
      NaN
      4.35
      4.26
      2.68



In [18]:

    
# fillna() returns a new DataFrame and leaves df untouched (it is not an in-place fill).
# The result is only displayed here, NOT assigned back: assigning it would replace the full
# dataset with a 5-row, zero-filled preview and break the later df.head() / df.describe() cells.
df.fillna(0).head()









    Out[18]:






  
    
      
      carat
      cut
      color
      clarity
      table
      depth
      cert
      measurements
      price
      x
      y
      z
    
  
  
    
      1
      0.25
      V.Good
      K
      I1
      59.0
      63.7
      GIA
      3.96 x 3.95 x 2.52
      0.0
      3.96
      3.95
      2.52
    
    
      2
      0.23
      Good
      G
      I1
      61.0
      58.1
      GIA
      4.00 x 4.05 x 2.30
      0.0
      4.00
      4.05
      2.30
    
    
      3
      0.34
      Good
      J
      I2
      58.0
      58.7
      GIA
      4.56 x 4.53 x 2.67
      0.0
      4.56
      4.53
      2.67
    
    
      4
      0.21
      V.Good
      D
      I1
      60.0
      60.6
      GIA
      3.80 x 3.82 x 2.31
      0.0
      3.80
      3.82
      2.31
    
    
      5
      0.31
      V.Good
      K
      I1
      59.0
      62.2
      EGL
      4.35 x 4.26 x 2.68
      0.0
      4.35
      4.26
      2.68



In [19]:

    
df.head()









    Out[19]:






  
    
      
      carat
      cut
      color
      clarity
      table
      depth
      cert
      measurements
      price
      x
      y
      z
    
  
  
    
      1
      0.25
      V.Good
      K
      I1
      59.0
      63.7
      GIA
      3.96 x 3.95 x 2.52
      NaN
      3.96
      3.95
      2.52
    
    
      2
      0.23
      Good
      G
      I1
      61.0
      58.1
      GIA
      4.00 x 4.05 x 2.30
      NaN
      4.00
      4.05
      2.30
    
    
      3
      0.34
      Good
      J
      I2
      58.0
      58.7
      GIA
      4.56 x 4.53 x 2.67
      NaN
      4.56
      4.53
      2.67
    
    
      4
      0.21
      V.Good
      D
      I1
      60.0
      60.6
      GIA
      3.80 x 3.82 x 2.31
      NaN
      3.80
      3.82
      2.31
    
    
      5
      0.31
      V.Good
      K
      I1
      59.0
      62.2
      EGL
      4.35 x 4.26 x 2.68
      NaN
      4.35
      4.26
      2.68



In [20]:

    
df.columns









    Out[20]:





Index([u'carat', u'cut', u'color', u'clarity', u'table', u'depth', u'cert',
       u'measurements', u'price', u'x', u'y', u'z'],
      dtype='object')



In [21]:

    
df['color'].unique()









    Out[21]:





array(['K', 'G', 'J', 'D', 'F', 'E', 'H', 'I', 'L'], dtype=object)



In [22]:

    
df.describe()









    Out[22]:






  
    
      
      carat
      table
      depth
      price
      x
      y
      z
    
  
  
    
      count
      598024.000000
      598024.000000
      598024.000000
      597311.000000
      596209.000000
      596172.000000
      595480.000000
    
    
      mean
      1.071297
      57.631077
      61.063683
      8753.017974
      5.990771
      6.198671
      4.033430
    
    
      std
      0.812696
      4.996892
      7.604342
      13017.567760
      1.530936
      1.485891
      1.240951
    
    
      min
      0.200000
      0.000000
      0.000000
      300.000000
      0.150000
      1.000000
      0.040000
    
    
      25%
      0.500000
      56.000000
      61.000000
      1220.000000
      4.740000
      4.970000
      3.120000
    
    
      50%
      0.900000
      58.000000
      62.100000
      3503.000000
      5.780000
      6.050000
      3.860000
    
    
      75%
      1.500000
      59.000000
      62.700000
      11174.000000
      6.970000
      7.230000
      4.610000
    
    
      max
      9.250000
      75.900000
      81.300000
      99990.000000
      13.890000
      13.890000
      13.180000



In [23]:

    
df['x'].plot(kind='box')









    Out[23]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f25cff93450>



In [1]:

    
# Learn more about box plots
# http://www.physics.csbsju.edu/stats/box2.html

	carat	cut	color	clarity	table	depth	cert	measurements	price	x	y	z
1	0.25	V.Good	K	I1	59.0	63.7	GIA	3.96 x 3.95 x 2.52	NaN	3.96	3.95	2.52
2	0.23	Good	G	I1	61.0	58.1	GIA	4.00 x 4.05 x 2.30	NaN	4.00	4.05	2.30
3	0.34	Good	J	I2	58.0	58.7	GIA	4.56 x 4.53 x 2.67	NaN	4.56	4.53	2.67
4	0.21	V.Good	D	I1	60.0	60.6	GIA	3.80 x 3.82 x 2.31	NaN	3.80	3.82	2.31
5	0.31	V.Good	K	I1	59.0	62.2	EGL	4.35 x 4.26 x 2.68	NaN	4.35	4.26	2.68

	carat	table	depth	price	x	y	z
count	598024.000000	598024.000000	598024.000000	597311.000000	596209.000000	596172.000000	595480.000000
mean	1.071297	57.631077	61.063683	8753.017974	5.990771	6.198671	4.033430
std	0.812696	4.996892	7.604342	13017.567760	1.530936	1.485891	1.240951
min	0.200000	0.000000	0.000000	300.000000	0.150000	1.000000	0.040000
25%	0.500000	56.000000	61.000000	1220.000000	4.740000	4.970000	3.120000
50%	0.900000	58.000000	62.100000	3503.000000	5.780000	6.050000	3.860000
75%	1.500000	59.000000	62.700000	11174.000000	6.970000	7.230000	4.610000
max	9.250000	75.900000	81.300000	99990.000000	13.890000	13.890000	13.180000