In [255]:
import numpy as np
In [256]:
# Rank-1 array: show the values, then its Python type, dtype, shape and size.
x = np.array([1, 2, 3, 4, 5])
print(x, type(x), x.dtype, x.shape, x.size, sep="\n")

# Rank-2 array (4 rows x 3 columns): report the same attributes.
x = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
    [10, 11, 12],
])
print(x, type(x), x.dtype, x.shape, x.size, sep="\n")
In [257]:
# 3x4 array filled with zeros; the dtype is printed to show NumPy's default.
x = np.zeros(shape=(3, 4))
print(x, x.dtype, sep="\n")
In [258]:
# 3x4 array of ones stored as integers instead of the default floats.
x = np.ones(shape=(3, 4), dtype=int)
print(x, x.dtype, sep="\n")
In [259]:
# 3x4 array in which every element is the constant 5.
x = np.full(shape=(3, 4), fill_value=5)
print(x)
In [260]:
# 5x5 identity matrix with integer entries.
x = np.eye(N=5, dtype=int)
print(x)
In [261]:
# Square matrix with the given values on the main diagonal (k=0) and zeros elsewhere.
x = np.diag([10, 20, 30, 40], k=0)
print(x)
In [262]:
# Consecutive integers from 4 up to, but not including, 10 (step of 1 spelled out).
x = np.arange(4, 10, 1)
print(x)
In [263]:
# Values start at 1 and advance in steps of 3; the stop value 20 is exclusive.
x = np.arange(1, 20, 3)
print(x)
In [264]:
# Three evenly spaced samples from 1 to 20; unlike arange, the stop value is included.
x = np.linspace(start=1, stop=20, num=3)
print(x)
In [265]:
# Flat vector holding 0..19.
x = np.arange(20)
print(x)

# Reshape the existing vector into 4 rows x 5 columns (method form).
x = x.reshape((4, 5))
print(x)

# Build and reshape in a single expression (function form); same result.
x = np.reshape(np.arange(20), (4, 5))
print(x)
In [266]:
# random() samples floats uniformly from the half-open interval [0, 1).
x = np.random.random(size=(3, 3))
print(x)

# randint() samples integers from [4, 10): the upper bound is exclusive.
x = np.random.randint(low=4, high=10, size=(3, 3))
print(x)
In [267]:
# 5x5 draws from a normal distribution centred on 0 with standard deviation 0.1.
# The sample mean and std printed below only approximate those parameters.
x = np.random.normal(loc=0, scale=0.1, size=(5, 5))
print(x, x.mean(), x.std(), sep="\n")
In [268]:
x = np.array([1, 2, 3, 4, 5])
# Non-negative indices count from the front; negative indices count back from the end.
print(x[0], x[2], x[-1], x[-3], sep="\n")
In [269]:
## Get diagonal of a 2d array
x = np.reshape(np.arange(25), (5, 5))
# Printed in order: the matrix, its main diagonal, the diagonal one step above
# the main one (k=1), and the diagonal two steps below it (k=-2).
print(x, np.diag(x), np.diag(x, k=1), np.diag(x, k=-2), sep="\n")
In [270]:
## Get unique elements of an array
# np.unique returns the distinct values in ascending order.
x = np.array([1, 2, 3, 4, 2, 1, 1, 2, 5])
print(np.unique(x))
In [271]:
x = np.reshape(np.arange(1, 10), (3, 3))
print(x)
# Element lookups are written as [row, column].
print(x[0, 0], x[1, 0], x[2, 1], sep="\n")
## Modify element
x[2, 2] = -9
print(x)
In [272]:
## Delete Rows by Index
x = np.reshape(np.arange(9), (3, 3))
print(x)
# np.delete returns a new array without rows 0 and 2; x itself is untouched.
print(np.delete(arr=x, obj=[0, 2], axis=0))
In [273]:
## Delete Columns by Index
x = np.reshape(np.arange(9), (3, 3))
print(x)
# axis=1 removes columns 0 and 2 from the returned copy; x itself is untouched.
print(np.delete(arr=x, obj=[0, 2], axis=1))
In [274]:
## Append Row
x = np.reshape(np.arange(9), (3, 3))
print(x)
# The new row is given as a 1x3 nested list so that it lines up with x along axis 0.
print(np.append(arr=x, values=[[9, 10, 11]], axis=0))
In [275]:
## Append Column
x = np.reshape(np.arange(9), (3, 3))
print(x)
# The new column is given as a 3x1 nested list so that it lines up with x along axis 1.
print(np.append(arr=x, values=[[9], [10], [11]], axis=1))
In [276]:
## Insert Elements - 1D / Rank 1 Arrays
x = np.array([1, 2, 5, 6, 7, 8, 9, 10])
print(x)
# Both new values are placed in front of the element currently at index 2.
print(np.insert(arr=x, obj=2, values=[3, 4]))
In [277]:
## Insert Row at Specified Index - 2D Array
x = np.array([
    [1, 2, 3],
    [7, 8, 9],
])
print(x)
# With axis=0 the values become a new row placed in front of row index 1.
print(np.insert(arr=x, obj=1, values=[4, 5, 6], axis=0))
In [278]:
x = np.array([
    [1, 2],
    [4, 5],
])
print(x)
# Insert a column at index 2 (the end): first with one value per row,
# then with a scalar that is repeated for every row.
print(np.insert(arr=x, obj=2, values=[3, 6], axis=1))
print(np.insert(arr=x, obj=2, values=9, axis=1))
In [279]:
## Stack 2 Arrays - Vertically
# x has shape (2,) and y has shape (2, 2); vstack places x as the first row
# above y's rows, so the stacked result has shape (3, 2).
x = np.array([1, 2])
y = np.array([[3, 4], [5, 6]])
print(f"x=\n{x}")
print(f"y=\n{y}")
print(f"vstack=\n {np.vstack((x, y))}")
In [280]:
## Stack 2 Arrays - Horizontally
# x has shape (2, 1) and y has shape (2, 2). The call passes (y, x), so x is
# appended as the last column and the stacked result has shape (2, 3).
x = np.array([[3], [6]])
y = np.array([[1, 2], [4, 5]])
print(f"x=\n{x}")
print(f"y=\n{y}")
print(f"hstack=\n {np.hstack((y, x))}")
Slicing only creates new "views" on the original array, not new copies of the sliced array. To create a copy, use the copy() method.
In [281]:
x = np.arange(1, 21).reshape(4, 5)
print(x)
# First two rows and first two columns.
print(x[0:2, 0:2])
## Notice the subtle difference between the following:
## slicing with 0:1 keeps the column axis (shape (4, 1)), whereas indexing with
## the integer 0 drops it (shape (4,)).
print(x[:, 0:1])
print(x[:, 0])
In [282]:
x = np.reshape(np.arange(25), (5, 5))
print(x)
# Boolean-mask indexing: keeps the elements strictly between 10 and 17 and
# returns them as a flat 1-D array.
print(x[np.logical_and(x > 10, x < 17)])
In [283]:
x = np.array([1, 2, 3, 4, 5])
y = np.array([6, 8, 3, 2, 9])
# Printed in order: values present in both arrays, values only in x,
# and the values present in either array.
print(np.intersect1d(x, y), np.setdiff1d(x, y), np.union1d(x, y), sep="\n")
In [284]:
# Ten random integers between 1 and 10 inclusive (the upper bound 11 is exclusive).
x = np.random.randint(low=1, high=11, size=10)
print(x)

## Out-of-place sorting: np.sort returns a sorted copy, so x keeps its order.
print(f"oop sorted= \n {np.sort(x)}")
print(f"original= \n {x}")

## In-place sorting: the ndarray method reorders x itself.
x.sort()
print(f"ip sorted= \n {x}")
In [285]:
import torch
import numpy as np
# 4x3 array of floats drawn uniformly from [0, 1).
a = np.random.rand(4,3)
a
Out[285]:
In [286]:
# Wrap the NumPy array as a tensor. from_numpy does not copy: the tensor and
# `a` share the same memory, so an in-place change to one shows up in the other.
b = torch.from_numpy(a)
b
Out[286]:
In [287]:
# Convert the tensor back to a NumPy array.
b.numpy()
Out[287]:
In [288]:
import pandas as pd
In [289]:
# Cap how many rows / columns pandas renders for large objects.
# The options are set through `pd.options.display` directly instead of binding
# that object to a variable called `display`: such a variable shadows IPython's
# built-in display() function, and every later `display(df)` call would then
# fail because the options object is not callable.
pd.options.display.max_rows = 10
pd.options.display.max_columns = 10
In [290]:
### With default integer indices
groceries = pd.Series([30, 6, 'Foo', 'Bar'])
print(groceries)

### With custom indices (the dict keys become the index labels, in this order)
groceries = pd.Series({'egg': 30, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})
print(groceries)
In [291]:
# Structural attributes of the Series: shape, number of dimensions, element
# count, index labels and the underlying values.
print(
    groceries.shape,
    groceries.ndim,
    groceries.size,
    groceries.index,
    groceries.values,
    sep="\n",
)
# `in` tests membership against the index labels, not against the values.
print('bananas' in groceries, 'apples' in groceries, sep="\n")
In [292]:
groceries = pd.Series(data=[30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
# A single label returns the scalar stored under it.
print(groceries['egg'])
print('====\n')
## By labels: plain [] with a list of labels and .loc select the same elements.
print(groceries[['egg', 'apples']])
print(groceries.loc[['egg', 'apples']])
print('====\n')
## By position: always go through .iloc.
## Plain [] with integers (groceries[[0, -1]]) on a string-labelled Series only
## worked through a positional fallback that pandas has deprecated, so it is
## intentionally not used here.
print(groceries.iloc[[0, -1]])
In [293]:
## Change Element Values
groceries = pd.Series({'egg': 30, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})
# Overwrite the value stored under the 'egg' label.
groceries.loc['egg'] = 31
print(groceries)
In [294]:
## Drop Elements - Out-of-Place
groceries = pd.Series({'egg': 30, 'apples': 6, 'milk': 'Yes', 'bread': 'No'})
# drop() returns a new Series without 'apples' ...
print(groceries.drop(labels=['apples']))
# ... while the original still contains it.
print(groceries)
In [295]:
## Drop Elements - In-Place
# inplace=True removes the label from `groceries` itself; the call returns None,
# so there is nothing to assign.
groceries = pd.Series(data=[30, 6, 'Yes', 'No'], index=['egg', 'apples', 'milk', 'bread'])
groceries.drop(['apples'], inplace=True)
print(groceries)
In [296]:
# Numeric Series keyed by fruit name.
fruits = pd.Series({'apples': 10, 'oranges': 6, 'bananas': 3})
# Arithmetic with a scalar is applied to every element.
fruits + 1
Out[296]:
In [297]:
# NumPy ufuncs operate element-wise on a Series and keep its index.
np.sqrt(fruits)
Out[297]:
In [298]:
# Select two labels (returned in the order given), then multiply each value by 10.
fruits[['bananas', 'oranges']] * 10
Out[298]:
In [299]:
# Dictionary of Series without explicit labels, so each gets a default 0..n-1 index.
items = dict(
    Bob=pd.Series([245, 25, 55]),
    Alice=pd.Series([40, 110, 500, 45]),
)
# Confirm that `items` is a plain dict before handing it to pandas.
print(type(items))
# Each key becomes a column; Bob has no value for row 3, so that cell is NaN.
shopping_carts = pd.DataFrame(items)
shopping_carts
Out[299]:
In [300]:
## Create DF from csv
# df = pd.read_csv('myfile.csv')
In [301]:
# Dictionary of labelled Series: the inner dict keys are the row labels.
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'book': 40, 'glasses': 110, 'bike': 500, 'pants': 45}),
}
# Confirm that `items` is a plain dict before handing it to pandas.
print(type(items))
# Rows are the union of both indexes; a missing (row, column) pair becomes NaN.
shopping_carts = pd.DataFrame(items)
shopping_carts
Out[301]:
In [302]:
## Creating DF Using Subset of Dict
# Restrict the frame to Alice's column and to the 'glasses' and 'bike' rows,
# in that row order.
alice_sel_shopping_cart = pd.DataFrame(
    data=items,
    columns=['Alice'],
    index=['glasses', 'bike'],
)
alice_sel_shopping_cart
Out[302]:
In [303]:
items = {
    'Bob': pd.Series(index=['bike', 'pants', 'watch'], data=[245, 25, 55]),
    'Alice': pd.Series(index=['book', 'glasses', 'bike', 'pants'], data=[40, 110, 500, 45]),
}
shopping_carts = pd.DataFrame(items)
# (number of rows, number of columns)
shopping_carts.shape
Out[303]:
In [304]:
# Number of dimensions (axes) of the DataFrame.
shopping_carts.ndim
Out[304]:
In [305]:
# Column labels of the DataFrame.
shopping_carts.columns
Out[305]:
In [306]:
# The data as a NumPy array, without the row and column labels.
shopping_carts.values
Out[306]:
In [307]:
# Three shoppers; Bob's purchases use a different set of item labels.
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'book': 40, 'glasses': 110, 'bike': 500, 'pants': 45}),
    'Charlie': pd.Series({'book': 45, 'glasses': 90, 'bike': 70, 'pants': 450}),
}
df = pd.DataFrame(items)
df
Out[307]:
In [308]:
# A list of column labels inside [] selects those columns as a new DataFrame.
df[['Bob', 'Alice']]
Out[308]:
In [309]:
# Same selection through .loc: ':' keeps every row, the list picks columns by label.
df.loc[:, ['Bob', 'Alice']]
Out[309]:
In [339]:
print(df)
# .iloc selects by integer position: every row, columns 0 and 2.
df.iloc[:, [0, 2]]
Out[339]:
In [ ]:
# Select rows by their index labels.
df.loc[['bike', 'pants']]
In [ ]:
display(df)
# Select rows by integer position: the first and the third row.
df.iloc[[0, 2]]
In [ ]:
# A slice inside [] applies to rows: this keeps the first three.
df[:3]
In [ ]:
# Two rows picked at random; no random_state is passed, so the result differs between runs.
df.sample(n=2)
In [ ]:
# Chained lookup: df['Alice'] selects the column, then ['bike'] selects the row label.
# df.loc['bike', 'Alice'] reads the same cell in a single step.
df['Alice']['bike'] # Column label always comes first
In [ ]:
display(df)
# Boolean filter: keep the rows where Bob's value is greater than 40.
# Rows where Bob is NaN compare as False and are therefore left out.
df.loc[df['Bob'] > 40]
In [ ]:
# Rebuild the three-shopper frame so the following cells start from a known state.
items = dict(
    Bob=pd.Series([245, 25, 55], ['bike', 'pants', 'watch']),
    Alice=pd.Series([40, 110, 500, 45], ['book', 'glasses', 'bike', 'pants']),
    Charlie=pd.Series([45, 90, 70, 450], ['book', 'glasses', 'bike', 'pants']),
)
df = pd.DataFrame(items)
df
In [ ]:
# Assigning a list adds the column positionally (first value to the first row,
# and so on), so the list length has to equal the number of rows in df.
df['Dan'] = [1, 2, 3, 4, 5]
df
In [ ]:
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'book': 40, 'glasses': 110, 'bike': 500, 'pants': 45}),
    'Charlie': pd.Series({'book': 45, 'glasses': 90, 'bike': 70, 'pants': 450}),
}
df = pd.DataFrame(items)

# One-column frame with Dan's values for three of the five row labels.
items_new = {'Dan': pd.Series({'bike': 1, 'pants': 2, 'watch': 3})}
df_new = pd.DataFrame(items_new)

# Left join on the index: every row of df is kept, and rows that have no match
# in df_new get NaN in the 'Dan' column. df itself is not modified.
df.join(df_new, how='left')
In [ ]:
items = dict(
    Bob=pd.Series([245, 25, 55], ['bike', 'pants', 'watch']),
    Alice=pd.Series([40, 110, 500, 45], ['book', 'glasses', 'bike', 'pants']),
    Charlie=pd.Series([45, 90, 70, 450], ['book', 'glasses', 'bike', 'pants']),
)
df = pd.DataFrame(items)
# insert() modifies df in place, putting 'Dan' at column position 1
# (between Bob and Alice).
df.insert(loc=1, column='Dan', value=[1, 2, 3, 4, 5])
df
In [ ]:
# Row-wise sum of the four columns. '+' propagates NaN, so a row with a missing
# value in any of these columns gets NaN as its total.
df['Total'] = df['Bob'] + df['Alice'] + df['Charlie'] + df['Dan']
df
In [ ]:
# Build a one-row frame labelled 'phones' and show it.
new_item = {'Bob': 1, 'Alice': 2, 'Charlie': 2}
new_df = pd.DataFrame(new_item, index = ['phones'])
display(new_df)
# Stack the new row underneath the existing rows. DataFrame.append was removed
# in pandas 2.0; pd.concat is the supported equivalent and, like append did,
# returns a new frame (df is unchanged) with NaN for columns new_df lacks.
display(pd.concat([df, new_df]))
In [ ]:
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'book': 40, 'glasses': 110, 'bike': 500, 'pants': 45}),
    'Charlie': pd.Series({'book': 45, 'glasses': 90, 'bike': 70, 'pants': 450}),
}
df = pd.DataFrame(items)
display(df)
# pop() removes the column from df in place; the returned Series is discarded here.
df.pop('Bob')
display(df)
In [ ]:
items = dict(
    Bob=pd.Series([245, 25, 55], ['bike', 'pants', 'watch']),
    Alice=pd.Series([40, 110, 500, 45], ['book', 'glasses', 'bike', 'pants']),
    Charlie=pd.Series([45, 90, 70, 450], ['book', 'glasses', 'bike', 'pants']),
)
df = pd.DataFrame(items)
display(df)
# columns=[...] is the keyword spelling of drop([...], axis=1); the result is a
# new frame and df keeps all of its columns.
display(df.drop(columns=['Bob', 'Alice']))
In [ ]:
# drop() returns a new frame without the listed row labels; df itself is unchanged.
display(df.drop(['watch', 'book'], axis=0)) # axis=0 => row / index
In [ ]:
# All three shoppers share the same row labels here, so the frame has no NaNs.
items = {'Bob' : pd.Series(data = [245, 25, 55], index = ['bike', 'pants', 'watch']),
'Alice' : pd.Series(data = [40, 110, 500], index = ['bike', 'pants', 'watch']),
'Charlie': pd.Series(data = [45, 90, 70], index = ['bike', 'pants', 'watch'])}
df = pd.DataFrame(items)
display(df)
# Scale only the selected columns by 100 and write them back into df.
# Plain vectorised arithmetic replaces apply(lambda x: x * 100): it produces the
# same values without a Python-level call per column. The list name was
# previously misspelled as `columns_to_tranform`.
columns_to_transform = ['Bob', 'Charlie']
df[columns_to_transform] = df[columns_to_transform] * 100
display(df)
In [ ]:
#### Substitute values in all columns of a DF
# The two lists are matched by position: 40 -> 'Foo' and 7000 -> 'Bar'.
# replace() returns a new DataFrame; df is not modified because the result is
# not assigned back.
df.replace([40, 7000], ['Foo', 'Bar'])
In [ ]:
# Frame with deliberately mismatched row labels so that some cells are NaN;
# it is used by the missing-data examples that follow.
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'book': 40, 'glasses': 110, 'bike': 500, 'pants': 45}),
    'Charlie': pd.Series({'book': 45, 'glasses': 90, 'bike': 70, 'pants': 450, 'watch': 1}),
}
df = pd.DataFrame(items)
display(df)
In [ ]:
## Counting NaNs
# isnull() flags each missing cell, the first sum() counts them per column and
# the second sum() adds those per-column counts together.
df.isnull().sum().sum()
In [ ]:
## Counting NaNs per column
# Returns a Series indexed by column name.
df.isnull().sum()
In [ ]:
## Plot NaN count per column
import matplotlib.pyplot as plt
%matplotlib inline
# One bar per column, with the bar height equal to that column's NaN count.
df_nans = df.isnull().sum()
plt.bar(x=df_nans.index, height=df_nans.values)
# Vertical tick labels; the trailing ';' suppresses the text repr of the return value.
plt.xticks(rotation = 90);
In [ ]:
## Counting non-NaNs
# count() gives the number of non-missing entries per column; sum() totals them.
df.count().sum()
In [ ]:
## Drop rows with NaNs
# Keeps only the rows without any missing value; returns a new frame, df is unchanged.
display(df.dropna(axis=0))
In [ ]:
## Drop columns with NaNs
# Keeps only the columns without any missing value; returns a new frame, df is unchanged.
display(df.dropna(axis=1))
In [ ]:
## Replace all NaNs with 0
# Returns a new frame; df still contains its NaNs afterwards.
display(df.fillna(0))
In [ ]:
## Forward fill NaNs (value of previous row)
display(df)
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
# A NaN in the first row stays NaN because there is no earlier row to copy from.
# Related options: df.bfill() fills from the following row instead, axis=1 fills
# across columns, and linear interpolation is done with df.interpolate()
# ('linear' is not a fill method).
display(df.ffill(axis=0))
In [ ]:
# Every shopper uses the same three row labels, so the frame is fully populated;
# it feeds the summary-statistics examples that follow.
items = {
    'Bob': pd.Series({'bike': 245, 'pants': 25, 'watch': 55}),
    'Alice': pd.Series({'bike': 40, 'pants': 110, 'watch': 500}),
    'Charlie': pd.Series({'bike': 45, 'pants': 90, 'watch': 70}),
}
df = pd.DataFrame(items)
display(df)
In [ ]:
## Describe statistical information of DF
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()
In [ ]:
# The same summary statistics, restricted to a single column.
df['Bob'].describe()
In [ ]:
# Mean of each column, returned as a Series indexed by column name.
df.mean()
In [ ]:
# Maximum of each column, returned as a Series indexed by column name.
df.max()
In [312]:
import pandas as pd
import seaborn as sb
# The path is relative, so the file is looked up in the current working directory.
# This rebinds `df`, replacing the small example frame used in the cells above.
df = pd.read_csv('pokemon.csv')
# Preview the first five rows.
df.head()
Out[312]:
In [313]:
# Bar chart with one bar per generation_id value, its height being the number of rows.
sb.countplot(data = df, x = 'generation_id')
plt.title('My plot')
Out[313]:
In [314]:
## Single color bars
# Take the first colour of seaborn's current palette and use it for every bar.
base_color = sb.color_palette()[0]
sb.countplot(data = df, x = 'generation_id', color = base_color)
Out[314]:
In [315]:
## Sort left to right
# value_counts() sorts by descending frequency, so its index lists the
# categories from most to least common; that order is passed to countplot.
gen_order = df['generation_id'].value_counts().index
sb.countplot(data = df, x = 'generation_id', order = gen_order)
Out[315]:
In [316]:
## Rotate x tick labels
## Without rotation
# Baseline with default tick labels, for comparison with the rotated version.
sb.countplot(data = df, x = 'type_1')
Out[316]:
In [317]:
## With rotation
import matplotlib.pyplot as plt
# Rotate the x tick labels by 90 degrees.
# NOTE(review): this is called before the plot is drawn; if the labels come out
# unrotated, move this call after countplot.
plt.xticks(rotation = 90)
sb.countplot(data = df, x = 'type_1')
Out[317]:
In [318]:
## Plot Y Bars
# Passing the column as y instead of x draws horizontal bars.
sb.countplot(data = df, y = 'type_1')
Out[318]:
In [319]:
import numpy as np
import matplotlib.pyplot as plt
# Total number of rows, used to convert between counts and proportions.
n_points = df.shape[0]
# Largest category count and the share of all rows it represents.
max_count = df['generation_id'].value_counts().max()
max_percent = max_count / n_points
# Tick positions as proportions in steps of 0.05, up to but excluding max_percent.
tick_props = np.arange(0, max_percent, 0.05)
# Tick labels formatted with two decimals.
tick_names = ['{:0.2f}'.format(v) for v in tick_props]
sb.countplot(data = df, x = 'generation_id')
# The axis is still in counts, so each tick is placed at proportion * n_points
# and labelled with the proportion itself.
plt.yticks(tick_props * n_points, tick_names)
plt.ylabel('proportion')
Out[319]:
In [320]:
# Count the missing values per column once and reuse the result (the original
# recomputed df.isna().sum() three times, the first time as a discarded expression).
na_counts = df.isna().sum()
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally, and keywords also work on older releases.
sb.barplot(x=na_counts.index, y=na_counts.values)
# Column names are long, so draw the tick labels vertically.
plt.xticks(rotation = 90)
Out[320]:
In [321]:
# Preview the first five rows.
df.head()
Out[321]:
In [322]:
# Histogram of the 'speed' column using 20 equal-width bins; ';' hides the return value.
plt.hist(data = df, x = 'speed', bins = 20);
In [323]:
# Explicit bin edges every 5 units starting at 0. np.arange excludes its stop
# value, so stopping at max + 5 guarantees an edge at or above the maximum.
bins = np.arange(0, df['speed'].max()+5, 5)
plt.hist(data = df, x = 'speed', bins = bins);
In [324]:
# Histogram of 'speed' with a kernel density estimate drawn on top.
# NOTE: distplot is deprecated since seaborn 0.11; histplot / displot replace it.
sb.distplot(df['speed']);
In [325]:
# Histogram only: kde=False turns the density curve off.
# NOTE: distplot is deprecated since seaborn 0.11; histplot / displot replace it.
sb.distplot(df['speed'], kde=False);
In [326]:
import matplotlib.pyplot as plt
# One wide figure holding two side-by-side axes.
plt.figure(figsize = [15, 5])
# Left panel: histogram without the density curve.
plt.subplot(1, 2, 1) # 1 row, 2 cols, subplot 1
sb.distplot(df['speed'], kde=False);
# Right panel: histogram with the density curve.
plt.subplot(1, 2, 2) # 1 row, 2 cols, subplot 2
sb.distplot(df['speed']);
In [327]:
# Histogram of 'height' with matplotlib's default binning.
plt.hist(data = df, x = 'height');
In [328]:
# Same histogram, with the visible x-axis range restricted to 0..2.
# Only the view changes; the bins are still computed from the full data.
plt.hist(data = df, x = 'height');
plt.xlim((0, 2))
Out[328]:
In [329]:
## Original plot (with linear scale)
# Baseline for comparison with the log-scaled versions that follow.
plt.hist(data = df, x = 'weight');
In [330]:
## Plots with log scales for x-axis
plt.figure(figsize = [15, 5])
# Left panel: seaborn histogram (no density curve) on a log-scaled x-axis.
plt.subplot(1, 2, 1)
sb.distplot(df['weight'], kde=False)
plt.xscale('log')
# Right panel: matplotlib histogram on a log-scaled x-axis. The bins are still
# equal-width in linear units; only the axis is rescaled.
plt.subplot(1, 2, 2)
plt.hist(data = df, x = 'weight');
plt.xscale('log')
In [331]:
## Changing x range, whilst in log scale to better visualise data distribution
plt.xscale('log')
# Exponents (base 10) of the smallest and largest weight. They are named
# log_min / log_max rather than min / max so that Python's built-in min() and
# max() are not overwritten for the rest of the kernel session.
log_min = np.log10(df['weight'].min())
log_max = np.log10(df['weight'].max())
# Bin edges evenly spaced in log10 space (step 0.1); the extra 0.1 on the stop
# value keeps the largest weight inside the last bin.
bins = 10 ** np.arange(log_min, log_max + 0.1, 0.1)
plt.hist(data = df, x = 'weight', bins = bins);
In [332]:
# Grid of pairwise plots between the columns of df, coloured by generation_id.
sb.pairplot(df, hue='generation_id');
In [333]:
# Preview the first five rows.
df.head()
Out[333]:
In [334]:
import matplotlib.pyplot as plt
# Count of rows per type_1, with one bar per generation_id within each type.
# height and aspect set the size of the figure that catplot creates.
sb.catplot(x='type_1', kind='count', hue='generation_id', data=df, height=10, aspect=10/7.5);
# sb.countplot(x='type_1', hue='generation_id', data=df)
plt.xticks(rotation = 90);
In [335]:
# Scatter of speed (x) against type_1 (y), with both axes labelled explicitly.
plt.scatter(data=df, x='speed', y='type_1')
plt.xlabel('speed')
plt.ylabel('type_1')
Out[335]:
In [336]:
# Scatter plot of height against speed with a fitted linear regression line.
sb.regplot(data=df, x='speed', y='height')
Out[336]:
In [ ]: