In [7]:
# seaborn is a powerful data visualization tool
# updated Jul 28, 2017
import seaborn as sns
import matplotlib.pyplot as plt
In [8]:
# Load seaborn's "iris" example dataset; the result is a pandas DataFrame.
# NOTE(review): load_dataset may need network access on first use — confirm
# the dataset is cached if this notebook must run offline.
iris = sns.load_dataset("iris")

# Preview the first rows. The call form print(...) with a single argument
# produces the same output on Python 2 and Python 3, whereas the bare
# `print x` statement is a SyntaxError on Python 3.
print(iris.head())
In [10]:
# Pairwise-relationship grid for the iris columns, coloured by species.
pair_grid = sns.pairplot(data=iris, hue="species")
plt.show()
In [13]:
# Histogram of a single column: sepal length over the whole dataset,
# drawn in blue with 20 bins and without a KDE overlay.
sepal_length = iris['sepal_length']
sns.distplot(sepal_length, bins=20, color="b", kde=False)
plt.show()
In [18]:
# Histogram of a column, restricted to a single species.

# List the distinct species labels. print(...) with one argument behaves the
# same on Python 2 and Python 3; the bare print statement fails on Python 3.
print(iris['species'].unique())

# Boolean row mask selecting only the 'setosa' samples.
is_setosa = iris['species'] == 'setosa'
ax = sns.distplot(iris.loc[is_setosa, 'sepal_length'], kde=False, color="b", bins=20)
plt.show()
In [23]:
# Overlay the sepal-length histograms of the three species on one axes.
fig, ax = plt.subplots()

# (species label, matplotlib colour) pairs, in plotting order.
species_colors = [("setosa", "b"), ("versicolor", "r"), ("virginica", "c")]
for species_name, color in species_colors:
    sns.distplot(
        iris.loc[iris['species'] == species_name, 'sepal_length'],
        kde=False, color=color, bins=20, label=species_name, ax=ax
    )

# Every histogram carries its own label, so the legend no longer depends on
# the plotting order happening to match iris['species'].unique().
ax.legend()
# With kde=False (and no fitted density) distplot draws raw bin counts, so the
# axis is labelled as a count rather than a relative frequency.
ax.set_ylabel('count')
plt.show()
In [26]:
# Separate plots: one sepal-length histogram per species, stacked in three rows.
# A single plt.subplots call creates the figure and all three axes; the
# previous extra full-figure axes underneath the subplots is no longer created.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=3, ncols=1)

# (species label, matplotlib colour, target axes) for each panel.
panels = [("setosa", "b", ax1), ("versicolor", "r", ax2), ("virginica", "c", ax3)]
for species_name, color, panel_ax in panels:
    sns.distplot(
        iris.loc[iris['species'] == species_name, 'sepal_length'],
        kde=False, color=color, bins=20, label=species_name, ax=panel_ax
    )
    # Legend and y-label are set per panel. plt.legend/plt.ylabel only touch
    # the current (last) axes, which previously labelled the virginica panel
    # with the first species name and left the other panels unlabelled.
    panel_ax.legend()
    # kde=False means the bars are raw counts, not relative frequencies.
    panel_ax.set_ylabel('count')
plt.show()
In [29]:
# Box plot of sepal length, one box per species, using the "PRGn" palette.
box_ax = sns.boxplot(data=iris, x="species", y="sepal_length", palette="PRGn")
plt.show()
In [ ]:
In [61]:
from scipy import stats

# Linear regression with all data: sepal_length is x, sepal_width is y.
slope, intercept, r_value, p_value, std_err = stats.linregress(
    iris['sepal_length'], iris['sepal_width'])

# Print the five results separated by single spaces. Joining the str() of each
# value gives the same text on Python 2 and Python 3; the bare print statement
# is a SyntaxError on Python 3, and print(a, b, ...) would print a tuple on
# Python 2.
print(" ".join(str(value) for value in (slope, intercept, r_value, p_value, std_err)))

# Joint plot of the same two columns with kind="reg".
sns.jointplot(x="sepal_length", y="sepal_width", data=iris, kind="reg")
plt.show()
In [62]:
# Linear regression for species = 'setosa'.
# Select the setosa rows once instead of rebuilding the mask per column.
setosa = iris.loc[iris['species'] == 'setosa']
slope, intercept, r_value, p_value, std_err = stats.linregress(
    setosa['sepal_length'], setosa['sepal_width'])

# Space-separated output that is identical on Python 2 and Python 3 (the bare
# print statement does not parse on Python 3).
print(" ".join(str(value) for value in (slope, intercept, r_value, p_value, std_err)))

# Show the results of a linear regression within each species.
g = sns.lmplot(
    x="sepal_length", y="sepal_width", hue="species", data=iris,
    palette="muted", scatter_kws={"s": 50, "alpha": 1}
)
plt.show()
In [ ]: