In [1]:
import pandas as pd
In [2]:
# Load the Iris measurements. Column names are supplied explicitly, so pandas
# treats every row of the file as data rather than as a header.
iris = pd.read_csv(
    'datasets/iris.data',
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'],
)
# Leave the preview as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump produced by print().
iris.head()
In [3]:
# Wine-review table; the CSV's first column becomes the row index.
wine_reviews = pd.read_csv(
    'datasets/winemag-data-130k-v2.csv',
    index_col=0,
)
In [4]:
# Preview the first five rows of the wine-review table.
wine_reviews.head(n=5)
Out[4]:
In [5]:
import matplotlib.pyplot as plt
In [33]:
# Sepal length vs. sepal width for every row of iris, drawn on a fresh figure.
fig, ax = plt.subplots()
ax.scatter(x=iris['sepal_length'], y=iris['sepal_width'])
# label the axes and title the figure so it can be read on its own
ax.set_xlabel('sepal_length')
ax.set_title('Iris Dataset')
ax.set_ylabel('sepal_width')
Out[33]:
In [8]:
# Map each class label to a matplotlib color code.
colors = {'Iris-setosa': 'r', 'Iris-versicolor': 'g', 'Iris-virginica': 'b'}
fig, ax = plt.subplots()
# Draw all points with a single scatter call: translating the class column
# through the color dict yields one color per row. This avoids issuing a
# separate ax.scatter() call (and creating a separate artist) for every sample,
# and it no longer relies on the frame having a 0..n-1 integer index.
ax.scatter(iris['sepal_length'], iris['sepal_width'], c=iris['class'].map(colors))
ax.set_title('Iris Dataset')
ax.set_xlabel('sepal_length')
ax.set_ylabel('sepal_width')
Out[8]:
In [20]:
# Line plot of every measurement column against the row position.
fig, ax = plt.subplots()
# every column except the label column
columns = iris.columns.drop('class')
# the x axis is simply the row number 0 .. n_rows-1
x_data = range(len(iris))
for column in columns:
    # one labelled line per measurement so the legend can identify it
    ax.plot(x_data, iris[column], label=column)
ax.set_title('Iris dataset')
ax.legend()
Out[20]:
In [22]:
# Distribution of review scores using matplotlib's default binning.
fig, ax = plt.subplots()
ax.hist(x=wine_reviews['points'])
# label the axes and title the figure
ax.set_xlabel('Points')
ax.set_title('Wine Review Scores')
ax.set_ylabel('Frequency')
Out[22]:
In [23]:
# Bar chart with one bar per distinct review score.
# tally how many reviews received each score
data = wine_reviews['points'].value_counts()
# scores go on the x axis, their tallies give the bar heights
points, frequency = data.index, data.values
fig, ax = plt.subplots()
ax.bar(x=points, height=frequency)
ax.set_xlabel('Points')
ax.set_title('Wine Review Scores')
ax.set_ylabel('Frequency')
Out[23]:
In [25]:
# Same sepal scatter as the matplotlib version, through the pandas plotting API.
iris.plot(kind='scatter', x='sepal_length', y='sepal_width', title='Iris dataset')
Out[25]:
In [27]:
# One line per measurement column; the label column is removed before plotting.
iris.drop(columns=['class']).plot(kind='line', title='Iris dataset')
Out[27]:
In [28]:
# Histogram of the review scores with pandas' default binning.
wine_reviews['points'].plot(kind='hist')
Out[28]:
In [29]:
# One 20-bin histogram per column, arranged in a 2x2 grid of subplots.
iris.plot(
    kind='hist',
    subplots=True,
    layout=(2, 2),
    figsize=(10, 10),
    bins=20,
)
Out[29]:
In [30]:
# Vertical bars: number of reviews per score, ordered by score.
wine_reviews['points'].value_counts().sort_index().plot(kind='bar')
Out[30]:
In [31]:
# Horizontal bars: number of reviews per score, ordered by score.
wine_reviews['points'].value_counts().sort_index().plot(kind='barh')
Out[31]:
In [32]:
# Five countries with the highest mean wine price, shown as a bar chart.
(
    wine_reviews
    .groupby('country')['price']
    .mean()
    .sort_values(ascending=False)
    .head(5)
    .plot(kind='bar')
)
Out[32]:
In [34]:
import seaborn as sns
In [37]:
# Seaborn version of the sepal scatter; columns are referenced by name.
sns.scatterplot(data=iris, x='sepal_length', y='sepal_width')
Out[37]:
In [36]:
# Same scatter, with the points colored by the value of the class column.
sns.scatterplot(
    data=iris,
    x='sepal_length',
    y='sepal_width',
    hue='class',
)
Out[36]:
In [41]:
# One line per measurement column; the label column is removed before plotting.
sns.lineplot(data=iris.drop(columns=['class']))
Out[41]:
In [46]:
# Histogram only, without the gaussian kernel density estimate (kde).
# sns.distplot is deprecated (seaborn >= 0.11) in favour of histplot; with no
# KDE both draw raw counts per bin, so the figure is unchanged.
sns.histplot(wine_reviews['points'], bins=10, kde=False)
Out[46]:
In [47]:
# Histogram plus the gaussian kernel density estimate (kde).
# sns.distplot is deprecated (seaborn >= 0.11); histplot is its replacement.
# stat='density' keeps the normalized y axis that distplot used whenever a
# KDE curve was drawn.
sns.histplot(wine_reviews['points'], bins=10, kde=True, stat='density')
Out[47]:
In [48]:
# One bar per distinct score. The column is passed by keyword because newer
# seaborn releases bind the first positional argument to `data`, not `x`.
sns.countplot(x='points', data=wine_reviews)
Out[48]:
In [51]:
# Keep only the top-rated wines (95 points or more) priced below 1000.
df = wine_reviews[(wine_reviews['points'] >= 95) & (wine_reviews['price'] < 1000)]
# x and y are passed by keyword: seaborn >= 0.12 made every boxplot argument
# after `data` keyword-only, so boxplot('points', 'price', ...) raises
# TypeError there. The keyword form works on older releases as well.
sns.boxplot(x='points', y='price', data=df)
Out[51]:
In [55]:
import numpy as np
# Correlation matrix of the numeric measurements. The string 'class' column is
# dropped first: pandas >= 2.0 no longer silently skips non-numeric columns in
# corr() and raises when it meets them.
corr = iris.drop(['class'], axis=1).corr()
fig, ax = plt.subplots()
# draw the matrix as an image, one cell per pair of columns
im = ax.imshow(corr.values)
# one tick per column on both axes, labelled with the column names
ax.set_xticks(np.arange(len(corr.columns)))
ax.set_yticks(np.arange(len(corr.columns)))
ax.set_xticklabels(corr.columns)
ax.set_yticklabels(corr.columns)
# tilt the x labels so the long column names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')
Out[55]:
In [56]:
import numpy as np
# Correlation matrix of the numeric measurements. The string 'class' column is
# dropped first: pandas >= 2.0 no longer silently skips non-numeric columns in
# corr() and raises when it meets them.
corr = iris.drop(['class'], axis=1).corr()
fig, ax = plt.subplots()
# draw the matrix as an image, one cell per pair of columns
im = ax.imshow(corr.values)
# one tick per column on both axes, labelled with the column names
ax.set_xticks(np.arange(len(corr.columns)))
ax.set_yticks(np.arange(len(corr.columns)))
ax.set_xticklabels(corr.columns)
ax.set_yticklabels(corr.columns)
# tilt the x labels so the long column names do not overlap
plt.setp(ax.get_xticklabels(), rotation=45, ha='right',
         rotation_mode='anchor')
# write each coefficient, rounded to two decimals, in the centre of its cell;
# imshow puts column j on the x axis and row i on the y axis
for i in range(len(corr.columns)):
    for j in range(len(corr.columns)):
        ax.text(j, i, np.around(corr.iloc[i, j], decimals=2),
                ha="center", va="center", color="black")
In [57]:
# Annotated correlation heatmap. The string 'class' column is dropped before
# corr(): pandas >= 2.0 raises on non-numeric columns instead of skipping them.
sns.heatmap(iris.drop(['class'], axis=1).corr(), annot=True)
Out[57]:
In [58]:
# One panel per class, each showing the density estimate of sepal_length.
# FacetGrid.map returns the grid itself, so the two steps chain into one.
g = sns.FacetGrid(data=iris, col='class').map(sns.kdeplot, 'sepal_length')
In [59]:
# Grid of pairwise relationships between the columns of iris.
sns.pairplot(data=iris)
Out[59]:
In [61]:
from pandas.plotting import scatter_matrix
# Pairwise scatter plots of the columns of iris at full opacity.
# figsize is handed to scatter_matrix directly: passing a single pre-made Axes
# for a multi-panel grid makes pandas clear that figure and emit a UserWarning
# before drawing the grid on it.
scatter_matrix(iris, alpha=1, figsize=(12, 12))
Out[61]:
In [ ]: