Visualization strategy:
- Single variable
- numeric continuous variable
- histogram: distribution of values
- boxplot: outlier analysis
- Categorical (string or discrete numeric)
- frequency plot
- Association plot
- continuous vs continuous: scatter plot
- continuous vs categorical: vertical bar and boxplot (regression problems)
- categorical vs continuous: horizontal bar (classification problems)
- categorical vs categorical: heatmap
In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

try:
    from matplotlib.mlab import normpdf
except ImportError:
    # matplotlib 3.1 removed mlab.normpdf; keep the same name available so the
    # cells below run on both old and new matplotlib releases.
    def normpdf(x, *args):
        """Return the normal pdf evaluated at x; args are (mu, sigma)."""
        mu, sigma = args
        return stats.norm.pdf(x, loc=mu, scale=sigma)

%matplotlib inline
plt.rcParams['figure.figsize'] = 10, 6
In [2]:
# Load the ISL "Auto" data set: a whitespace-delimited text file read over HTTP.
# NOTE(review): this file reportedly marks missing values with "?", which would
# make pandas parse the affected numeric column as strings. na_values="?" turns
# those entries into NaN; it is a no-op if no "?" is present -- confirm against
# the data.
df = pd.read_csv(
    "http://www-bcf.usc.edu/~gareth/ISL/Auto.data",
    sep=r"\s+",
    na_values="?",
)
# Preview the first 10 rows.
df.head(10)
Out[2]:
In [3]:
# Print the frame summary (columns, dtypes, non-null counts) to check how each
# column was parsed before plotting.
df.info()
In [4]:
# List the distinct values found in the `year` column.
df.year.unique()
Out[4]:
In [5]:
# Show 10 randomly chosen rows. The fixed seed makes the displayed sample the
# same on every Restart & Run All.
df.sample(10, random_state=42)
Out[5]:
In [6]:
# Single continuous variable: 30-bin histogram of mpg.
mpg_values = df["mpg"]
plt.hist(mpg_values, bins=30)
plt.xlabel("mpg")
plt.title("Histogram plot of mpg")
plt.ylabel("Frequency")
Out[6]:
In [7]:
# Single continuous variable: box plot of mpg, used to spot outliers.
mpg_values = df["mpg"]
plt.boxplot(x=mpg_values)
plt.title("Boxplot of mpg\n ")
plt.ylabel("mpg")
Out[7]:
In [8]:
# Two stacked panels for mpg:
#   top    - density-normalized histogram with a normal curve overlaid
#   bottom - horizontal box plot of the same values
plt.subplot(2, 1, 1)
# `density=True` replaces the `normed` keyword that matplotlib 3.1 removed.
# It rescales the bars to unit area so the pdf overlay shares the y-axis scale.
n, bins, patches = plt.hist(df["mpg"], bins=50, density=True)
plt.title("Histogram plot of mpg")
plt.xlabel("MPG")
# Normal pdf built from the sample mean and standard deviation, evaluated at
# the histogram bin edges.
pdf = normpdf(bins, df["mpg"].mean(), df["mpg"].std())
plt.plot(bins, pdf, color="red")

plt.subplot(2, 1, 2)
plt.boxplot(df["mpg"], vert=False)
plt.title("Boxplot of mpg")
plt.xlabel("MPG")
# Lay out the panels last, after every title and label has been set.
plt.tight_layout()
Out[8]:
In [9]:
# Display the normal-pdf values at the histogram bin edges (`bins` comes from
# the histogram cell above), using the sample mean and standard deviation of mpg.
mpg_mean = df["mpg"].mean()
mpg_std = df["mpg"].std()
normpdf(bins, mpg_mean, mpg_std)
Out[9]:
In [10]:
# Same density histogram, drawn through the pandas plotting accessor.
plt.figure(figsize=(10, 6))
# pandas forwards extra keywords to matplotlib's hist, which no longer accepts
# `normed` (removed in matplotlib 3.1); `density=True` is the replacement.
df.mpg.plot.hist(bins=50, density=True)
plt.title("Histogram plot of mpg")
plt.xlabel("mpg")
Out[10]:
In [11]:
# Row count for each `year` value, ordered by year rather than by frequency.
counts = df["year"].value_counts().sort_index()
In [12]:
# Frequency plot for a categorical variable: one bar per year, drawn at integer
# positions and labelled with the year values.
bar_positions = np.arange(len(counts))
plt.figure(figsize=(10, 4))
plt.bar(bar_positions, counts, align="center")
plt.xticks(bar_positions, counts.index)
plt.ylabel("Frequency")
plt.xlabel("Year")
plt.title("Frequency distribution by year")
Out[12]:
In [13]:
# Same frequency plot, drawn with the pandas bar-plot accessor.
plt.figure(figsize=(10, 4))
year_counts = df["year"].value_counts().sort_index()
year_counts.plot.bar()
Out[13]:
In [14]:
# Continuous vs continuous: scatter of mpg against weight, with the correlation
# coefficient shown in the title.
weight_values, mpg_values = df["weight"], df["mpg"]
# Off-diagonal entry of the 2x2 correlation matrix.
corr = np.corrcoef(weight_values, mpg_values)[0, 1]
plt.scatter(weight_values, mpg_values)
plt.ylabel("Mpg")
plt.xlabel("Weight")
plt.title("Mpg vs Weight, correlation: %.2f" % corr)
Out[14]:
Scatter plot using pandas dataframe plot function
In [15]:
# Same scatter via the DataFrame plot accessor; `corr` was computed in the
# previous cell.
scatter_ax = df.plot.scatter(x="weight", y="mpg")
scatter_ax.set_title("Mpg vs Weight, correlation: %.2f" % corr)
Out[15]:
In [16]:
# Median and standard deviation of mpg within each year.
# String aggregation names are used instead of np.median / np.std: pandas 2.x
# warns when NumPy callables are passed to groupby.agg. The resulting columns
# are still named "median" and "std".
mpg_by_year = df.groupby("year")["mpg"].agg(["median", "std"])
mpg_by_year.head()
Out[16]:
In [17]:
# Continuous vs categorical: bar of median mpg per year, with the per-year
# standard deviation drawn as red error bars.
bar_ax = mpg_by_year["median"].plot.bar(yerr=mpg_by_year["std"], ecolor="red")
bar_ax.set_title("MPG by year")
bar_ax.set_xlabel("year")
bar_ax.set_ylabel("MPG")
Out[17]:
Show the boxplot of MPG by year
In [18]:
# Distribution of mpg within each year, one box per year.
plt.figure(figsize=(10, 5))
# x and y are passed by keyword: seaborn 0.12+ no longer accepts them as
# positional arguments.
sns.boxplot(x="year", y="mpg", data=df)
Out[18]:
In [19]:
# Annotated heatmap of pairwise correlations between the numeric columns.
plt.figure(figsize=(10, 8))
# Restrict to numeric columns first: from pandas 2.0, DataFrame.corr() raises
# on non-numeric columns instead of silently dropping them.
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), cmap=sns.color_palette("RdBu", 10), annot=True)
Out[19]:
In [20]:
# Categorical vs categorical: mean mpg for every (year, cylinders) pair, shown
# as an annotated heatmap with years as rows and cylinder counts as columns.
plt.figure(figsize=(10, 8))
# .mean() replaces .agg(np.mean), which emits a FutureWarning in pandas 2.x.
aggr = df.groupby(["year", "cylinders"])["mpg"].mean().unstack()
sns.heatmap(aggr, cmap=sns.color_palette("Blues", n_colors=10), annot=True)
Out[20]:
In [21]:
# Load the iris CSV from GitHub and preview the first rows.
iris_url = "https://raw.githubusercontent.com/abulbasar/data/master/iris.csv"
iris = pd.read_csv(iris_url)
iris.head()
Out[21]:
In [22]:
# Scatter of petal length against sepal length, one colour per species, all
# drawn on a single shared Axes.
fig, ax = plt.subplots()
x1, x2 = "SepalLengthCm", "PetalLengthCm"
species = iris.Species.unique()
# Size the palette from the data instead of hard-coding 3 colours.
cmap = sns.color_palette("husl", n_colors=len(species))
for color, c in zip(cmap, species):
    # Each species subset is plotted onto the same Axes with its own label.
    iris[iris.Species == c].plot.scatter(x1, x2, color=color, label=c, ax=ax)
# Build the legend once, after every species has been drawn.
ax.legend()
Out[22]:
In [23]:
# Normal probability (Q-Q) plot of mpg, drawn through pyplot.
# `stats` is scipy.stats, imported with the other dependencies in the first cell.
p = stats.probplot(df["mpg"], dist="norm", plot=plt)
In [ ]: