In [1]:
#### Introduction to Data Wrangling with Pandas ####
## Page 2 ##
In [2]:
#### Performing basic statistical operations ####
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#so that we can view the graphs inside the notebook
In [4]:
# Small example series containing one missing value (NaN), so the cells
# below can show how each statistic treats missing data.
s = pd.Series(
    data=[1, 2, np.nan, 3],
    name="num",
)
In [5]:
s
Out[5]:
In [6]:
# Promote the series to a one-column DataFrame; the series name ("num")
# becomes the column label.
df1 = s.to_frame()
In [7]:
df1
Out[7]:
In [11]:
#If our age dataset is a year old
#df['age_now'] = df['age'] + 1
In [8]:
# Column arithmetic is vectorised: every entry of 'num' is doubled and the
# result is stored at the same row position in the new 'value' column
# (the NaN entry stays NaN).
df1['value'] = df1['num'].mul(2)
df1
Out[8]:
In [9]:
len(df1['num'])
Out[9]:
In [10]:
df1['num'].count()
Out[10]:
In [11]:
df1['num'].mean()
Out[11]:
In [12]:
df1['num'].sum()
Out[12]:
In [13]:
df1['num'].median()
Out[13]:
In [14]:
df1['num'].std()
Out[14]:
In [13]:
df1['num'].min()
Out[13]:
In [14]:
df1['num'].max()
Out[14]:
In [20]:
df1['num'].describe()
Out[20]:
In [15]:
df1['num'].cumsum()
Out[15]:
In [21]:
df1.mean()
Out[21]:
In [22]:
# When you pass the whole dataframe, all numerical columns will be analysed
In [23]:
df1.corr()
Out[23]:
In [24]:
# Corr between num and value is 1 because value is the double of num
In [15]:
# Box plot of the 'num' column, drawn through the pandas plotting accessor.
df1['num'].plot.box()
Out[15]:
In [26]:
# The box spans the interquartile range: the red line is the median, the lower blue line is the 25% quartile and the upper blue line is the 75% quartile
In [16]:
# Load the diamonds dataset. The CSV column labelled 'Unnamed: 0' is used as
# the row index rather than being kept as a regular data column
# (presumably it is a saved row index - confirm against the file).
df = pd.read_csv(
    filepath_or_buffer='~/diamonds-data/diamonds.csv',
    index_col='Unnamed: 0',
)
In [28]:
len(df)
Out[28]:
In [17]:
df.head()
Out[17]:
In [18]:
# fillna returns a new DataFrame and does not modify df (it is not in-place).
# Only preview the filled result here: assigning the output of .head() back
# to df would silently shrink the dataset to its first five rows, and every
# later cell (describe, unique, box plot) would then run on just those rows.
df.fillna(0).head()
Out[18]:
In [19]:
df.head()
Out[19]:
In [20]:
df.columns
Out[20]:
In [21]:
df['color'].unique()
Out[21]:
In [22]:
df.describe()
Out[22]:
In [23]:
# Box plot of the 'x' column, drawn through the pandas plotting accessor.
df['x'].plot.box()
Out[23]:
In [1]:
# Learn more about box plots
# http://www.physics.csbsju.edu/stats/box2.html