Table of Contents

Numpy


In [255]:
import numpy as np

Create Numpy Array from Python List


In [256]:
x = np.array([1, 2, 3, 4, 5])
print(x)
print(type(x))
print(x.dtype)
print(x.shape)
print(x.size)

x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
print(x)
print(type(x))
print(x.dtype)
print(x.shape)
print(x.size)


[1 2 3 4 5]
<class 'numpy.ndarray'>
int64
(5,)
5
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]
<class 'numpy.ndarray'>
int64
(4, 3)
12

Create Numpy Array from Built-in Functions


In [257]:
x = np.zeros((3, 4))
print(x)
print(x.dtype)


[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
float64

In [258]:
x = np.ones((3, 4), dtype=int)
print(x)
print(x.dtype)


[[1 1 1 1]
 [1 1 1 1]
 [1 1 1 1]]
int64

In [259]:
x = np.full((3, 4), 5)
print(x)


[[5 5 5 5]
 [5 5 5 5]
 [5 5 5 5]]

In [260]:
x = np.eye(5, dtype=int)
print(x)


[[1 0 0 0 0]
 [0 1 0 0 0]
 [0 0 1 0 0]
 [0 0 0 1 0]
 [0 0 0 0 1]]

In [261]:
x = np.diag([10, 20, 30, 40])
print(x)


[[10  0  0  0]
 [ 0 20  0  0]
 [ 0  0 30  0]
 [ 0  0  0 40]]

In [262]:
x = np.arange(4, 10)
print(x)


[4 5 6 7 8 9]

In [263]:
x = np.arange(1, 20, 3)
print(x)


[ 1  4  7 10 13 16 19]

In [264]:
x = np.linspace(1, 20, 3)
print(x)


[ 1.  10.5 20. ]

In [265]:
x = np.arange(20)
print(x)

x = np.reshape(x, (4, 5))
print(x)

x = np.arange(20).reshape(4, 5)
print(x)


[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]

In [266]:
# Defaults to range [0, 1)
x = np.random.random((3, 3))
print(x)

x = np.random.randint(4, 10, (3, 3))
print(x)


[[0.84701144 0.24795452 0.96356938]
 [0.06035776 0.43578403 0.94127149]
 [0.77229273 0.00347168 0.85898623]]
[[4 9 5]
 [4 6 6]
 [9 5 7]]

In [267]:
# mean = 0, std = 0.1
x = np.random.normal(0, 0.1, (5, 5))
print(x)

print(x.mean())
print(x.std())


[[-0.10627019  0.09512631  0.08031865 -0.07410845  0.07270698]
 [-0.19585703 -0.08315314 -0.01766702 -0.12364803 -0.1563959 ]
 [-0.11398013 -0.07062806  0.10359602  0.278749    0.08053277]
 [ 0.10407057 -0.07610102  0.20789975 -0.01227405 -0.05789641]
 [-0.1193928   0.05199796 -0.00707654 -0.10544349  0.07731245]]
-0.006703271290734083
0.11526196988483964

Accessing, Deleting and Inserting Elements into NDArrays


In [268]:
x = np.array([1, 2, 3, 4, 5])

print(x[0])
print(x[2])
print(x[-1])
print(x[-3])


1
3
5
3

In [269]:
## Get  diagonal of a 2d array

x = np.arange(25).reshape(5, 5)
print(x)

print(np.diag(x))
print(np.diag(x, k=1))
print(np.diag(x, k=-2))


[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]
[ 0  6 12 18 24]
[ 1  7 13 19]
[10 16 22]

In [270]:
## Get unique elements of an array

x = np.array([1, 2, 3, 4, 2, 1, 1, 2, 5])
print(np.unique(x))


[1 2 3 4 5]

In [271]:
x = np.arange(1, 10).reshape(3, 3)

print(x)
print(x[0, 0])
print(x[1, 0])
print(x[2, 1])

## Modify element
x[2, 2] = -9
print(x)


[[1 2 3]
 [4 5 6]
 [7 8 9]]
1
4
8
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8 -9]]

In [272]:
## Delete Rows by Index

x = np.arange(9).reshape(3, 3)
print(x)
print(np.delete(x, [0, 2], axis=0))


[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[3 4 5]]

In [273]:
## Delete Columns by Index

x = np.arange(9).reshape(3, 3)
print(x)
print(np.delete(x, [0, 2], axis=1))


[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[1]
 [4]
 [7]]

In [274]:
## Append Row

x = np.arange(9).reshape(3, 3)
print(x)
print(np.append(x, [[9, 10, 11]], axis=0))


[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]

In [275]:
## Append Column

x = np.arange(9).reshape(3, 3)
print(x)
print(np.append(x, [[9], [10], [11]], axis=1))


[[0 1 2]
 [3 4 5]
 [6 7 8]]
[[ 0  1  2  9]
 [ 3  4  5 10]
 [ 6  7  8 11]]

In [276]:
## Insert Elements - 1D / Rank 1 Arrays

x = np.array([1, 2, 5, 6, 7, 8, 9, 10])
print(x)
print(np.insert(x, 2, [3, 4]))


[ 1  2  5  6  7  8  9 10]
[ 1  2  3  4  5  6  7  8  9 10]

In [277]:
## Insert Row at Specified Index - 2D Array

x = np.array([[1, 2, 3], [7, 8, 9]])
print(x)
print(np.insert(x, 1, [4, 5, 6], axis=0))


[[1 2 3]
 [7 8 9]]
[[1 2 3]
 [4 5 6]
 [7 8 9]]

In [278]:
x = np.array([[1, 2], [4, 5]])
print(x)
print(np.insert(x, 2, [3, 6], axis=1))
print(np.insert(x, 2, 9, axis=1))


[[1 2]
 [4 5]]
[[1 2 3]
 [4 5 6]]
[[1 2 9]
 [4 5 9]]

In [279]:
## Stack 2 Arrays - Vertically

x = np.array([1, 2])
y = np.array([[3, 4], [5, 6]])
print(f"x=\n{x}")
print(f"y=\n{y}")
print(f"vstack=\n {np.vstack((x, y))}")


x=
[1 2]
y=
[[3 4]
 [5 6]]
vstack=
 [[1 2]
 [3 4]
 [5 6]]

In [280]:
## Stack 2 Arrays - Horizontally

x = np.array([[3], [6]])
y = np.array([[1, 2], [4, 5]])
print(f"x=\n{x}")
print(f"y=\n{y}")
print(f"hstack=\n {np.hstack((y, x))}")


x=
[[3]
 [6]]
y=
[[1 2]
 [4 5]]
hstack=
 [[1 2 3]
 [4 5 6]]

Slicing NDArrays

Slicing only creates new "views" on the original array, not new copies of the sliced array. To create a copy, use the copy() method.


In [281]:
x = np.arange(1, 21).reshape(4, 5)
print(x)

print(x[0:2, 0:2])

## Notice the subtle difference between the followig
print(x[:, 0:1])
print(x[:, 0])


[[ 1  2  3  4  5]
 [ 6  7  8  9 10]
 [11 12 13 14 15]
 [16 17 18 19 20]]
[[1 2]
 [6 7]]
[[ 1]
 [ 6]
 [11]
 [16]]
[ 1  6 11 16]

Boolean Indexing


In [282]:
x = np.arange(25).reshape(5, 5)
print(x)

print(x[(x > 10) & (x < 17)])


[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]
 [20 21 22 23 24]]
[11 12 13 14 15 16]

Set Operations


In [283]:
x = np.array([1, 2, 3, 4, 5])
y = np.array([6, 8, 3, 2, 9])

print(np.intersect1d(x, y))
print(np.setdiff1d(x, y))
print(np.union1d(x, y))


[2 3]
[1 4 5]
[1 2 3 4 5 6 8 9]

Sorting


In [284]:
x = np.random.randint(1, 11, size=(10, ))
print(x)

## Out-of-place sorting
print(f"oop sorted= \n {np.sort(x)}")
print(f"original= \n {x}")

## In-place sorting
x.sort()
print(f"ip sorted= \n {x}")


[ 6 10  4  5  3  6  5  3 10 10]
oop sorted= 
 [ 3  3  4  5  5  6  6 10 10 10]
original= 
 [ 6 10  4  5  3  6  5  3 10 10]
ip sorted= 
 [ 3  3  4  5  5  6  6 10 10 10]

Numpy <=> Torch Tensor


In [285]:
import torch
import numpy as np
a = np.random.rand(4,3)
a


Out[285]:
array([[0.53173224, 0.37468615, 0.09600771],
       [0.16019028, 0.87208775, 0.96772617],
       [0.30046583, 0.0997555 , 0.9653974 ],
       [0.65708248, 0.37446061, 0.60867549]])

In [286]:
b = torch.from_numpy(a)
b


Out[286]:
tensor([[0.5317, 0.3747, 0.0960],
        [0.1602, 0.8721, 0.9677],
        [0.3005, 0.0998, 0.9654],
        [0.6571, 0.3745, 0.6087]], dtype=torch.float64)

In [287]:
b.numpy()


Out[287]:
array([[0.53173224, 0.37468615, 0.09600771],
       [0.16019028, 0.87208775, 0.96772617],
       [0.30046583, 0.0997555 , 0.9653974 ],
       [0.65708248, 0.37446061, 0.60867549]])

Pandas


In [288]:
import pandas as pd

Display Options


In [289]:
display = pd.options.display
display.max_rows=10
display.max_columns=10

Pandas Series

Create


In [290]:
### With default integer indices
groceries = pd.Series(data=[30, 6, 'Foo', 'Bar'])
print(groceries)

### With custom indices
groceries = pd.Series(data=[30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
print(groceries)


0     30
1      6
2    Foo
3    Bar
dtype: object
egg        30
apples      6
milk      Yes
bread      No
dtype: object

Attributes


In [291]:
print(groceries.shape)
print(groceries.ndim)
print(groceries.size)
print(groceries.index)
print(groceries.values)
print('bananas' in groceries)
print('apples' in groceries)


(4,)
1
4
Index(['egg', 'apples', 'milk', 'bread'], dtype='object')
[30 6 'Yes' 'No']
False
True

Accessing Data


In [292]:
groceries = pd.Series(data=[30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
print(groceries['egg'])

print('====\n')

## By labels
print(groceries[['egg', 'apples']])
print(groceries.loc[['egg', 'apples']])
print('====\n')

## By index
print(groceries[[0, -1]])
print(groceries.iloc[[0, -1]])


30
====

egg       30
apples     6
dtype: object
egg       30
apples     6
dtype: object
====

egg      30
bread    No
dtype: object
egg      30
bread    No
dtype: object

Modify Series


In [293]:
## Change Element Values
groceries = pd.Series(data=[30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
groceries[['egg']] = 31
print(groceries)


egg        31
apples      6
milk      Yes
bread      No
dtype: object

In [294]:
## Drop Elements - Out-of-Place
groceries = pd.Series(data=[30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
print(groceries.drop(['apples']))
print(groceries)


egg       30
milk     Yes
bread     No
dtype: object
egg        30
apples      6
milk      Yes
bread      No
dtype: object

In [295]:
## Drop Elements - In-Place
groceries = pd.Series(data=[30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
groceries.drop(['apples'], inplace=True)
print(groceries)


egg       30
milk     Yes
bread     No
dtype: object

Arithmetic Operations


In [296]:
fruits= pd.Series(data = [10, 6, 3,], index = ['apples', 'oranges', 'bananas'])
fruits + 1


Out[296]:
apples     11
oranges     7
bananas     4
dtype: int64

In [297]:
np.sqrt(fruits)


Out[297]:
apples     3.162278
oranges    2.449490
bananas    1.732051
dtype: float64

In [298]:
fruits[['bananas', 'oranges']] * 10


Out[298]:
bananas    30
oranges    60
dtype: int64

Pandas DataFrames

Create


In [299]:
# We create a dictionary of Pandas Series 
items = {'Bob' : pd.Series(data = [245, 25, 55]),
         'Alice' : pd.Series(data = [40, 110, 500, 45])}

# We print the type of items to see that it is a dictionary
print(type(items))
shopping_carts = pd.DataFrame(items)
shopping_carts


<class 'dict'>
Out[299]:
Bob Alice
0 245.0 40
1 25.0 110
2 55.0 500
3 NaN 45

In [300]:
## Create DF from csv
# df = pd.read_csv('myfile.csv')

In [301]:
# We create a dictionary of Pandas Series 
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants'])}

# We print the type of items to see that it is a dictionary
print(type(items))
shopping_carts = pd.DataFrame(items)
shopping_carts


<class 'dict'>
Out[301]:
Bob Alice
bike 245.0 500.0
book NaN 40.0
glasses NaN 110.0
pants 25.0 45.0
watch 55.0 NaN

In [302]:
## Creating DF Using Subset of Dict

# We Create a DataFrame that only has selected items for Alice
alice_sel_shopping_cart = pd.DataFrame(items, index = ['glasses', 'bike'], columns = ['Alice'])
alice_sel_shopping_cart


Out[302]:
Alice
glasses 110
bike 500

Attributes


In [303]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants'])}
shopping_carts = pd.DataFrame(items)

shopping_carts.shape


Out[303]:
(5, 2)

In [304]:
shopping_carts.ndim


Out[304]:
2

In [305]:
shopping_carts.columns


Out[305]:
Index(['Bob', 'Alice'], dtype='object')

In [306]:
shopping_carts.values


Out[306]:
array([[245., 500.],
       [ nan,  40.],
       [ nan, 110.],
       [ 25.,  45.],
       [ 55.,  nan]])

Accessing Data


In [307]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants']),
         'Charlie': pd.Series(data = [45, 90, 70, 450], index = ['book', 'glasses', 'bike', 'pants'])}

df = pd.DataFrame(items)
df


Out[307]:
Bob Alice Charlie
bike 245.0 500.0 70.0
book NaN 40.0 45.0
glasses NaN 110.0 90.0
pants 25.0 45.0 450.0
watch 55.0 NaN NaN

Access column(s) by label


In [308]:
df[['Bob', 'Alice']]


Out[308]:
Bob Alice
bike 245.0 500.0
book NaN 40.0
glasses NaN 110.0
pants 25.0 45.0
watch 55.0 NaN

In [309]:
df.loc[:, ['Bob', 'Alice']]


Out[309]:
Bob Alice
bike 245.0 500.0
book NaN 40.0
glasses NaN 110.0
pants 25.0 45.0
watch 55.0 NaN

Access column(s) by index


In [339]:
print(df)
df.iloc[:, [0, 2]]


      id      species  generation_id  height  weight  ...  attack defense  \
0      1    bulbasaur              1     0.7     6.9  ...      49      49   
1      2      ivysaur              1     1.0    13.0  ...      62      63   
2      3     venusaur              1     2.0   100.0  ...      82      83   
3      4   charmander              1     0.6     8.5  ...      52      43   
4      5   charmeleon              1     1.1    19.0  ...      64      58   
..   ...          ...            ...     ...     ...  ...     ...     ...   
802  803      poipole              7     0.6     1.8  ...      73      67   
803  804    naganadel              7     3.6   150.0  ...      73      73   
804  805    stakataka              7     5.5   820.0  ...     131     211   
805  806  blacephalon              7     1.8    13.0  ...     127      53   
806  807      zeraora              7     1.5    44.5  ...     112      75   

    speed  special-attack  special-defense  
0      45              65               65  
1      60              80               80  
2      80             100              100  
3      65              60               50  
4      80              80               65  
..    ...             ...              ...  
802    73              73               67  
803   121             127               73  
804    13              53              101  
805   107             151               79  
806   143             102               80  

[807 rows x 14 columns]
Out[339]:
id generation_id
0 1 1
1 2 1
2 3 1
3 4 1
4 5 1
... ... ...
802 803 7
803 804 7
804 805 7
805 806 7
806 807 7

807 rows × 2 columns

Access row(s) by label


In [ ]:
df.loc[['bike', 'pants']]

Access row(s) by index


In [ ]:
display(df)
df.iloc[[0, 2]]

Get N rows from a DF


In [ ]:
df[:3]

Get N random rows from a DF


In [ ]:
df.sample(n=2)

Access element by row and column label


In [ ]:
df['Alice']['bike']  # Column label always comes first

Get all rows where column value satisfies condition


In [ ]:
display(df)
df.loc[df['Bob'] > 40]

Modify DF


In [ ]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants']),
         'Charlie': pd.Series(data = [45, 90, 70, 450], index = ['book', 'glasses', 'bike', 'pants'])}

df = pd.DataFrame(items)
df

Add column


In [ ]:
df['Dan'] = [1, 2, 3, 4, 5]
df

Append columns from a DF to another DF


In [ ]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants']),
         'Charlie': pd.Series(data = [45, 90, 70, 450], index = ['book', 'glasses', 'bike', 'pants'])}

df = pd.DataFrame(items)
df

items_new = {'Dan' : pd.Series(data = [1, 2, 3], index = ['bike', 'pants', 'watch']),}
df_new = pd.DataFrame(items_new)
df.join(df_new)

Insert column at index


In [ ]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants']),
         'Charlie': pd.Series(data = [45, 90, 70, 450], index = ['book', 'glasses', 'bike', 'pants'])}

df = pd.DataFrame(items)

df.insert(1, 'Dan', [1, 2, 3, 4, 5])
df

Add column using sum of previous columns values


In [ ]:
df['Total'] = df['Bob'] + df['Alice'] + df['Charlie'] + df['Dan']
df

Add rows


In [ ]:
new_item = {'Bob': 1, 'Alice': 2, 'Charlie': 2}
new_df = pd.DataFrame(new_item, index = ['phones'])
display(new_df)

display(df.append(new_df))

Delete column


In [ ]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants']),
         'Charlie': pd.Series(data = [45, 90, 70, 450], index = ['book', 'glasses', 'bike', 'pants'])}

df = pd.DataFrame(items)
display(df)
df.pop('Bob')
display(df)

Delete multiple columns


In [ ]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants']),
         'Charlie': pd.Series(data = [45, 90, 70, 450], index = ['book', 'glasses', 'bike', 'pants'])}

df = pd.DataFrame(items)
display(df)

display(df.drop(['Bob', 'Alice'], axis=1)) # 1 = columns

Delete multiple rows


In [ ]:
display(df.drop(['watch', 'book'], axis=0)) # axis=0 => row / index

Transform values of selected columns


In [ ]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500], index = ['bike', 'pants', 'watch']),
         'Charlie': pd.Series(data = [45, 90, 70], index = ['bike', 'pants', 'watch'])}

df = pd.DataFrame(items)
display(df)

columns_to_tranform = ['Bob', 'Charlie']
df[columns_to_tranform] = df[columns_to_tranform].apply(lambda x: x * 100)
display(df)

In [ ]:
#### Substitute values in all columns of a DF
df.replace([40, 7000], ['Foo', 'Bar'])

 Dealing with NaN


In [ ]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500, 45], index = ['book', 'glasses', 'bike', 'pants']),
         'Charlie': pd.Series(data = [45, 90, 70, 450, 1], index = ['book', 'glasses', 'bike', 'pants', 'watch'])}

df = pd.DataFrame(items)
display(df)

In [ ]:
## Counting NaNs
df.isnull().sum().sum()

In [ ]:
## Counting NaNs per column
df.isnull().sum()

In [ ]:
## Plot NaN count per column
import matplotlib.pyplot as plt

%matplotlib inline

df_nans = df.isnull().sum()
plt.bar(x=df_nans.index, height=df_nans.values)
plt.xticks(rotation = 90);

In [ ]:
## Counting non-NaNs
df.count().sum()

In [ ]:
## Drop rows with NaNs
display(df.dropna(axis=0))

In [ ]:
## Drop columns with NaNs
display(df.dropna(axis=1))

In [ ]:
## Replace all NaNs with 0
display(df.fillna(0))

In [ ]:
## Forward fill NaNs (value of previous row)
display(df)
display(df.fillna(method='ffill', axis=0)) # Other methods = 'backfill', 'linear'. Axis can be 1

Statistical Analysis


In [ ]:
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
         'Alice' : pd.Series(data = [40, 110, 500], index = ['bike', 'pants', 'watch']),
         'Charlie': pd.Series(data = [45, 90, 70], index = ['bike', 'pants', 'watch'])}

df = pd.DataFrame(items)
display(df)

In [ ]:
## Describe statistical information of DF
df.describe()

In [ ]:
df['Bob'].describe()

In [ ]:
df.mean()

In [ ]:
df.max()

Data Visualisation


In [312]:
import pandas as pd
import seaborn as sb

df = pd.read_csv('pokemon.csv')
df.head()


Out[312]:
id species generation_id height weight ... attack defense speed special-attack special-defense
0 1 bulbasaur 1 0.7 6.9 ... 49 49 45 65 65
1 2 ivysaur 1 1.0 13.0 ... 62 63 60 80 80
2 3 venusaur 1 2.0 100.0 ... 82 83 80 100 100
3 4 charmander 1 0.6 8.5 ... 52 43 65 60 50
4 5 charmeleon 1 1.1 19.0 ... 64 58 80 80 65

5 rows × 14 columns

Univariate Data

Categorical data frequency/count as bar chart


In [313]:
sb.countplot(data = df, x = 'generation_id')
plt.title('My plot')


Out[313]:
Text(0.5, 1.0, 'My plot')

In [314]:
## Single color bars
base_color = sb.color_palette()[0]
sb.countplot(data = df, x = 'generation_id', color = base_color)


Out[314]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a28ef9a10>

In [315]:
## Sort left to right
gen_order = df['generation_id'].value_counts().index
sb.countplot(data = df, x = 'generation_id', order = gen_order)


Out[315]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a28eb8710>

In [316]:
## Rotate x tick labels

## Without rotation
sb.countplot(data = df, x = 'type_1')


Out[316]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a29074c90>

In [317]:
## With rotation
import matplotlib.pyplot as plt

plt.xticks(rotation = 90)

sb.countplot(data = df, x = 'type_1')


Out[317]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a2919b8d0>

In [318]:
## Plot Y Bars
sb.countplot(data = df, y = 'type_1')


Out[318]:
<matplotlib.axes._subplots.AxesSubplot at 0x1a29074b10>

Categorical data relative frequency as bar chart


In [319]:
import numpy as np
import matplotlib.pyplot as plt

n_points = df.shape[0]
max_count = df['generation_id'].value_counts().max()
max_percent = max_count / n_points

tick_props = np.arange(0, max_percent,  0.05)
tick_names = ['{:0.2f}'.format(v) for v in tick_props]

sb.countplot(data = df, x = 'generation_id')
plt.yticks(tick_props * n_points, tick_names)
plt.ylabel('proportion')


Out[319]:
Text(0, 0.5, 'proportion')

Using Barplot to visualise processed data (not already stored as a column value)


In [320]:
df.isna().sum()
sb.barplot(df.isna().sum().index.values, df.isna().sum())
plt.xticks(rotation = 90)


Out[320]:
(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13]),
 <a list of 14 Text xticklabel objects>)

Numerical data histograms


In [321]:
df.head()


Out[321]:
id species generation_id height weight ... attack defense speed special-attack special-defense
0 1 bulbasaur 1 0.7 6.9 ... 49 49 45 65 65
1 2 ivysaur 1 1.0 13.0 ... 62 63 60 80 80
2 3 venusaur 1 2.0 100.0 ... 82 83 80 100 100
3 4 charmander 1 0.6 8.5 ... 52 43 65 60 50
4 5 charmeleon 1 1.1 19.0 ... 64 58 80 80 65

5 rows × 14 columns


In [322]:
plt.hist(data = df, x = 'speed', bins = 20);



In [323]:
bins = np.arange(0, df['speed'].max()+5, 5)
plt.hist(data = df, x = 'speed', bins = bins);



In [324]:
sb.distplot(df['speed']);



In [325]:
sb.distplot(df['speed'], kde=False);


Subplots (Stack Plots Horizontally)


In [326]:
import matplotlib.pyplot as plt

plt.figure(figsize = [15, 5])

plt.subplot(1, 2, 1)                    # 1 row, 2 cols, subplot 1
sb.distplot(df['speed'], kde=False);

plt.subplot(1, 2, 2)                    # 1 row, 2 cols, subplot 2
sb.distplot(df['speed']);


Plot Subset of Data (Axis Range Limits)


In [327]:
plt.hist(data = df, x = 'height');



In [328]:
plt.hist(data = df, x = 'height');
plt.xlim((0, 2))


Out[328]:
(0, 2)

Axis Transformations (Log Scale)


In [329]:
## Original plot (with linear scale)
plt.hist(data = df, x = 'weight');



In [330]:
## Plots with log scales for x-axis
plt.figure(figsize = [15, 5])

plt.subplot(1, 2, 1)    
sb.distplot(df['weight'], kde=False)
plt.xscale('log')

plt.subplot(1, 2, 2)    
plt.hist(data = df, x = 'weight');
plt.xscale('log')



In [331]:
## Changing x range, whilst in log scale to better visualise data distribution
plt.xscale('log')

min = np.log10(df['weight'].min())
max = np.log10(df['weight'].max())
bins = 10 ** np.arange(min, max + 0.1, 0.1)

plt.hist(data = df, x = 'weight', bins = bins);


Bivariate Data

Pairwise relationship between numerical columns


In [332]:
sb.pairplot(df, hue='generation_id');


Categorical data grouped-by another label


In [333]:
df.head()


Out[333]:
id species generation_id height weight ... attack defense speed special-attack special-defense
0 1 bulbasaur 1 0.7 6.9 ... 49 49 45 65 65
1 2 ivysaur 1 1.0 13.0 ... 62 63 60 80 80
2 3 venusaur 1 2.0 100.0 ... 82 83 80 100 100
3 4 charmander 1 0.6 8.5 ... 52 43 65 60 50
4 5 charmeleon 1 1.1 19.0 ... 64 58 80 80 65

5 rows × 14 columns


In [334]:
import matplotlib.pyplot as plt

sb.catplot(x='type_1', kind='count', hue='generation_id', data=df, height=10, aspect=10/7.5);
# sb.countplot(x='type_1', hue='generation_id', data=df)
plt.xticks(rotation = 90);


Categorical and numerical data relationships


In [335]:
plt.scatter(data=df, x='speed', y='type_1')
plt.xlabel('speed')
plt.ylabel('type_1')


Out[335]:
Text(0, 0.5, 'type_1')

Numerical data relationships - with linear regression estimate


In [336]:
sb.regplot(data=df, x='speed', y='height')


Out[336]:
<matplotlib.axes._subplots.AxesSubplot at 0x10c10d650>

Anaconda

List envs

conda info --envs

Activate env

conda activate <env_name>

Update all packages

conda upgrade -all

Install package

conda install package_name

## specifying package version
conda install numpy=1.10

Remove package

conda remove package_name

Search package

conda search *search_term*

List packages

conda list

Jupyter

Convert notebook to html

jupyter nbconvert --to html notebook.ipynb

# Other formats
# https://nbconvert.readthedocs.io/en/latest/usage.html

In [ ]: