In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np
import scipy.stats as ss
import warnings
warnings.filterwarnings("ignore")
sns.set_style('white')
%matplotlib inline
In [2]:
# Two mirrored series: y holds the values of x in reverse order, so the
# element-wise quotient x/y runs from 1/1000 up to 1000/1.
x = np.array([1, 1, 1, 1, 10, 100, 1000])
y = x[::-1].copy()
ratio = np.true_divide(x, y)
print(ratio)
Plot on the linear scale using the scatter()
function.
In [3]:
# Linear-scale view: one marker per ratio, placed at its position in the array.
n_ratios = len(ratio)
plt.scatter(np.arange(n_ratios), ratio, s=100)
# Thin dashed reference line at ratio = 1.
plt.plot([0, n_ratios], [1, 1], linewidth=.5, linestyle='--', color='k')
Out[3]:
Plot on the log scale.
In [4]:
# Same markers as the linear plot, now drawn against a logarithmic y axis.
n_ratios = len(ratio)
plt.scatter(np.arange(n_ratios), ratio, s=100)
plt.yscale('log')
plt.ylim(0.0001, 10000)  # y-axis range
# Thin dashed reference line at ratio = 1.
plt.plot([0, n_ratios], [1, 1], linewidth=.5, linestyle='--', color='k')
Out[4]:
What do you see from the two plots? Why do we need to use log scale to visualize ratios?
Let's practice this using random numbers. Generate 10 random numbers in the range [0, 1), calculate the ratios between consecutive numbers (the second number divided by the first, and so on), and plot the ratios on the linear and log scales.
In [5]:
# Draw 10 uniform samples from [0, 1). A seeded generator is used so the cell
# produces the same numbers (and therefore the same plots below) on every
# Restart & Run All.
rng = np.random.default_rng(42)
x = rng.random(10)
print(x)
# Ratio of each number to its predecessor: x[1]/x[0], x[2]/x[1], ...
# One vectorized division replaces the element-by-element Python loop.
ratio = x[1:] / x[:-1]
print(ratio)
In [6]:
# Ratios of consecutive random numbers on a linear y axis.
n_ratios = len(ratio)
plt.scatter(np.arange(n_ratios), ratio, s=100)
# Thin dashed reference line at ratio = 1.
plt.plot([0, n_ratios], [1, 1], linewidth=.5, linestyle='--', color='k')
Out[6]:
In [7]:
# Ratios of consecutive random numbers on a logarithmic y axis.
n_ratios = len(ratio)
plt.scatter(np.arange(n_ratios), ratio, s=100)
plt.yscale('log')
# Thin dashed reference line at ratio = 1.
plt.plot([0, n_ratios], [1, 1], linewidth=.5, linestyle='--', color='k')
Out[7]:
In [8]:
# Read the tab-separated IMDb file, then draw a histogram of the 'Votes' column
# with matplotlib's default binning.
movie_df = pd.read_csv('imdb.csv', sep='\t')
votes = movie_df['Votes']
plt.hist(votes)
Out[8]:
As we can see, most votes fall in the first bin, and we cannot see the values from the second bin. How about plotting on the log scale?
In [9]:
# Histogram of votes with the bin counts shown on a logarithmic y axis.
votes = movie_df['Votes']
plt.hist(votes)
plt.yscale('log')
Change the number of bins to 1000.
In [10]:
# Histogram of votes split into 1000 equal-width bins, counts on a log y axis.
votes = movie_df['Votes']
plt.hist(votes, bins=1000)
plt.yscale('log')
Now, let's try log-binning. Recall that when plotting histograms we can specify the edges of bins through the bins
parameter. For example, we can set the bin edges to [0, 1, 2, ... , 10] as follows.
In [11]:
# Explicit bin edges 0, 1, ..., 10 give ten unit-wide bins for the ratings.
rating_edges = np.arange(11)
plt.hist(movie_df['Rating'], bins=rating_edges)
Out[11]:
Here, we can specify the edges of bins in a similar way. Instead of specifying on the linear scale, we do it on the log space. Some useful resources:
Hint: since $10^{\text{start}} = \text{min_votes}$, $\text{start} = \log_{10}(\text{min_votes})$
In [12]:
# Build 20 bin edges evenly spaced in log10 space between the smallest and
# largest vote counts. Series.min()/.max() are vectorized and skip NaN, unlike
# the builtin min()/max(), which iterate the column element by element in Python.
votes = movie_df['Votes']
bins = np.logspace(np.log10(votes.min()), np.log10(votes.max()), num=20)
Now we can plot the histogram with log-binning.
In [13]:
# Histogram of votes using the log-spaced edges, shown on a logarithmic x axis.
votes = movie_df['Votes']
plt.hist(votes, bins=bins)
plt.xscale('log')
In [14]:
# Log-binned bins have unequal widths, so raw counts are misleading; normalise
# each bin to a probability density instead.
# `density=True` replaces the `normed=True` keyword, which matplotlib's hist()
# no longer accepts.
plt.hist(movie_df['Votes'], bins=bins, density=True)
plt.xscale('log')
plt.yscale('log')
In [15]:
# Read the tab-separated IMDb file into movie_df and preview the first five rows.
movie_df = pd.read_csv('imdb.csv', sep='\t')
movie_df.head(5)
Out[15]:
We can plot histogram and KDE using pandas:
In [16]:
# Density-normalised histogram of ratings with a KDE curve drawn on the same axes.
# `density=True` replaces the `normed=True` keyword, which is forwarded to
# matplotlib's hist() and is no longer accepted there.
movie_df['Rating'].hist(bins=10, density=True)
movie_df['Rating'].plot(kind='kde')
Out[16]:
Or using seaborn:
In [17]:
# Histogram (10 bins, density-normalised) with a KDE overlay.
# sns.distplot is deprecated in seaborn; histplot with stat='density' and
# kde=True is the replacement for its histogram-plus-KDE output.
sns.histplot(movie_df['Rating'], bins=10, stat='density', kde=True)
Out[17]:
Can you plot the histogram and KDE of the log of movie votes?
In [18]:
# Histogram and KDE of the natural log of the vote counts, using pandas plotting.
logs = np.log(movie_df['Votes'])
# `density=True` replaces the `normed=True` keyword, which matplotlib's hist()
# no longer accepts.
logs.hist(bins=10, density=True)
logs.plot(kind='kde')
plt.xlim(0, 25)
Out[18]:
In [19]:
# Histogram (10 bins, density-normalised) with a KDE overlay of the log vote counts.
# sns.distplot is deprecated in seaborn; histplot with stat='density' and
# kde=True is the replacement for its histogram-plus-KDE output.
sns.histplot(logs, bins=10, stat='density', kde=True)
Out[19]:
In [20]:
# One panel per sample size; each panel overlays the KDEs of five independent
# random samples of the ratings, showing how the estimate varies with N.
f = plt.figure(figsize=(15,8))
sample_sizes = [10, 50, 100, 500, 1000, 10000]
for i, N in enumerate(sample_sizes, 1):
    plt.subplot(2, 3, i)
    plt.title("Sample size: {}".format(N))
    for j in range(5):
        s = movie_df['Rating'].sample(N)
        # Gaussian is kdeplot's default kernel; the deprecated `kernel`
        # keyword is therefore omitted.
        sns.kdeplot(s, legend=False)
    # The x range has to be set on each subplot. Calling plt.xlim() once
    # before any subplot exists only affects a throwaway axes, not these panels.
    plt.xlim(0, 10)
Remember Anscombe's quartet? Let's plot the four datasets and do linear regression, which can be done with scipy's linregress()
function.
TODO: display the fitted equations using the text()
function.
In [21]:
# Anscombe's quartet as plain lists. Datasets 1-3 share the same X values.
X1 = [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]
X2 = list(X1)
X3 = list(X1)
X4 = [8.0] * 7 + [19.0] + [8.0] * 3  # all 8.0 except the eighth point

Y1 = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]
Y2 = [9.14, 8.14, 8.74, 8.77, 9.26, 8.10, 6.13, 3.10, 9.13, 7.26, 4.74]
Y3 = [7.46, 6.77, 12.74, 7.11, 7.81, 8.84, 6.08, 5.39, 8.15, 6.42, 5.73]
Y4 = [6.58, 5.76, 7.71, 8.84, 8.47, 7.04, 5.25, 12.50, 5.56, 7.91, 6.89]

data = list(zip((X1, X2, X3, X4), (Y1, Y2, Y3, Y4)))

plt.figure(figsize=(10, 8))
for i, p in enumerate(data, 1):
    X, Y = p
    plt.subplot(2, 2, i)

    # Raw observations.
    plt.scatter(X, Y, s=30, facecolor='#FF4500', edgecolor='#FF4500')

    # Least-squares fit, drawn as the segment from X = 0 to X = 20.
    slope, intercept, r_value, p_value, std_err = ss.linregress(X, Y)
    line_ends_x = [0, 20]
    line_ends_y = [intercept, slope*20+intercept]
    plt.plot(line_ends_x, line_ends_y, color='#1E90FF')

    # Annotate the panel with the fitted equation.
    equation = r'$Y = {:1.2f} \cdot X + {:1.2f}$'.format(slope, intercept)
    plt.text(2, 11, equation)

    plt.xlim(0, 20)
    plt.xlabel('X' + str(i))
    plt.ylabel('Y' + str(i))
Actually, the dataset is included in seaborn and we can load it.
In [22]:
# Load the "anscombe" dataset through seaborn's loader and preview five rows.
df = sns.load_dataset(name="anscombe")
df.head(5)
Out[22]:
All four datasets are in this single data frame and the 'dataset' indicator is one of the columns. This is a form often called tidy data, which is easy to manipulate and plot. In tidy data, each row is an observation and columns are the properties of the observation. Seaborn makes use of the tidy form.
We can show the linear regression results for each dataset. Here is an example:
In [23]:
# One regression panel per dataset (two panels per row), y regressed on x,
# with no confidence band.
# `height` is the current name of the facet-size parameter; the old `size`
# keyword is no longer accepted by lmplot.
sns.lmplot(x="x", y="y", col="dataset", hue="dataset", data=df,
           col_wrap=2, ci=None, palette="muted", height=4,
           scatter_kws={"s": 50, "alpha": 1})
Out[23]:
What do these parameters mean? The documentation for the lmplot()
is here.
In [24]:
# Same faceted regression with the axes swapped (x regressed on y) and smaller,
# slightly transparent markers.
# `height` is the current name of the facet-size parameter; the old `size`
# keyword is no longer accepted by lmplot.
sns.lmplot(x="y", y="x", col="dataset", hue="dataset", data=df,
           col_wrap=2, ci=None, palette="muted", height=4,
           scatter_kws={"s": 25, "alpha": 0.8})
Out[24]:
In [25]:
# Keep only rows whose 'Year' lies in 1990..1999 (both ends included).
year = movie_df['Year']
geq = year.ge(1990)
leq = year.le(1999)
subset = movie_df[geq & leq]
subset.head(5)
Out[25]:
We can draw a scatter plot of movie votes and ratings using the scatter()
function.
In [26]:
# Votes on the x axis against rating on the y axis, default marker style.
votes, rating = subset['Votes'], subset['Rating']
plt.scatter(votes, rating)
plt.xlabel('Votes')
plt.ylabel('Rating')
Out[26]:
Too many data points. We can decrease symbol size, set symbols empty, and make them transparent.
In [27]:
# Same scatter with small, hollow, semi-transparent blue markers to reduce overplotting.
marker_style = dict(s=20, alpha=0.6, facecolors='none', edgecolors='b')
plt.scatter(subset['Votes'], subset['Rating'], **marker_style)
plt.xlabel('Votes')
plt.ylabel('Rating')
Out[27]:
Number of votes is broadly distributed. So set the x axis to log scale.
In [28]:
# Hollow-marker scatter again, with smaller markers and a logarithmic x axis for the vote counts.
marker_style = dict(s=10, alpha=0.6, facecolors='none', edgecolors='b')
plt.scatter(subset['Votes'], subset['Rating'], **marker_style)
plt.xscale('log')
plt.xlabel('Votes')
plt.ylabel('Rating')
Out[28]:
We can combine scatter plot with 1D histogram using seaborn's jointplot()
function.
In [29]:
# Scatter of log(votes) against rating, with a marginal histogram on each axis.
# x and y are passed by keyword: current seaborn treats the first positional
# argument of jointplot as `data`, not as the x variable.
sns.jointplot(x=np.log(subset['Votes']), y=subset['Rating'])
Out[29]:
There are too many data points. We need to bin them, which can be done by using the jointplot()
and setting the kind
parameter.
In [30]:
# Joint plot of log(votes) against rating with hexagonal binning in the centre
# and a histogram for each marginal distribution.
# x and y are passed by keyword, and the kind is spelled 'hex': 'hexbin' is
# not one of the kind values jointplot accepts.
sns.jointplot(x=np.log(subset['Votes']), y=subset['Rating'], kind='hex')
Out[30]:
We can also do 2D KDE using seaborn's kdeplot()
function.
In [31]:
# Filled bivariate KDE of log(votes) against rating.
# The two variables are passed as x= / y= (kdeplot no longer takes a second
# positional series). `fill=True` replaces the deprecated `shade=True`, and
# `thresh=0.05` replaces `shade_lowest=False` by leaving the lowest-density
# region unfilled.
sns.kdeplot(x=np.log(subset['Votes']), y=subset['Rating'],
            cmap="Reds", fill=True, thresh=0.05)
Out[31]:
Or using jointplot()
by setting the kind
parameter.
In [32]:
# Joint plot of log(votes) against rating with a bivariate KDE in the centre
# and a KDE for each marginal distribution.
# x and y are passed by keyword. `fill=True` with `thresh=0.05` replaces the
# deprecated `shade_lowest=False`, keeping the filled contours while leaving
# the lowest-density region unfilled.
sns.jointplot(x=np.log(subset['Votes']), y=subset['Rating'],
              kind='kde', fill=True, thresh=0.05)
Out[32]: