In [1]:
#### Introduction to Data Wrangling with Pandas ####
## Page 6 ##
In [2]:
#### Recap - Exercise our learning ####
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
#so that we can view the graphs inside the notebook
In [4]:
# Load the diamonds CSV from the home-relative data directory; the column
# labelled 'Unnamed: 0' is used as the row index rather than kept as data.
df = pd.read_csv(
    '~/diamonds-data/diamonds.csv',
    index_col='Unnamed: 0',
)
In [5]:
# Number of rows currently in df.
df.shape[0]
Out[5]:
In [6]:
# Show the column labels of df.
df.columns
Out[6]:
In [7]:
# Peek at the first five rows (a positional selection from the top of the frame).
df.head(n=5)
Out[7]:
In [8]:
# Peek at the last five rows (a positional selection from the bottom of the frame).
df.tail(n=5)
Out[8]:
In [9]:
# Remember the row count before any rows are dropped, so the number of
# removed rows can be derived afterwards.
len_old = df.shape[0]
In [10]:
# Discard every row that has a missing value in at least one column.
df = df.dropna(axis='index', how='any')
In [11]:
# Number of rows that had at least one NA value, i.e. the rows removed
# between recording len_old and now.
len_old - len(df)
Out[11]:
In [12]:
# Summary statistics for df (describe's default output).
df.describe()
Out[12]:
In [13]:
# Columns that hold categorical (non-numeric) values.
non_numeric = [
    'clarity',
    'cut',
    'color',
    'cert',
]
In [14]:
# Print the distinct values found in each categorical column.
for name in non_numeric:
    print(name, df[name].unique())
In [17]:
# For each color group, count the non-null entries in every other column.
df.groupby('color').count()
Out[17]:
In [19]:
# Bar chart of the number of non-null carat entries per color group.
df.groupby('color')['carat'].count().plot.bar()
Out[19]:
In [20]:
# Bar chart (in red) of the number of non-null carat entries per cut group.
df.groupby('cut')['carat'].count().plot.bar(color='red')
Out[20]:
In [21]:
# Pairwise correlation of the numeric columns only. df also holds string
# columns (see non_numeric), and pandas >= 2.0 raises on them in corr()
# instead of silently skipping them, so they are excluded explicitly.
df.select_dtypes(include='number').corr()
Out[21]:
In [22]:
# Boolean mask of column pairs whose correlation exceeds 0.7. Only numeric
# columns are correlated: pandas >= 2.0 raises if corr() is given the string
# columns that df also contains.
df.select_dtypes(include='number').corr() > 0.7
Out[22]:
In [23]:
# Column subset via .loc: all rows, four columns selected by label.
df_subset = df.loc[:, ['carat', 'x', 'y', 'price']]  # subset: slice by label index
In [24]:
# Column subset via plain [] indexing with a list of column names.
df_subset = df[['carat', 'x', 'y', 'price']]  # subset: slice by column name
In [25]:
# Column subset via .iloc: all rows, columns selected by integer position.
# NOTE(review): positions 0, 8, 9, 10 are presumably the same four columns
# picked by name elsewhere (carat, x, y, price) -- confirm against df.columns.
# The scatter plot of df_subset needs 'carat' and 'price' to be among them.
df_subset = df.iloc[:, [0, 8, 9, 10]]  # subset: slice by numeric index
In [26]:
# Scatter plot of price against carat for the column subset.
df_subset.plot(kind='scatter', x='carat', y='price')
Out[26]:
In [27]:
# Same carat/price scatter with the x-axis limited to 4-6 carat; the low
# alpha makes the markers mostly transparent.
df.plot(
    kind='scatter',
    x='carat',
    y='price',
    alpha=0.1,
    xlim=(4, 6),
)
Out[27]:
In [28]:
# Carat/price scatter with the x-axis narrowed further to 5-5.2 carat; the
# low alpha makes the markers mostly transparent.
df.plot(
    kind='scatter',
    x='carat',
    y='price',
    alpha=0.1,
    xlim=(5, 5.2),
)
Out[28]:
In [ ]: