In [ ]:
import numpy as np
import matplotlib.pyplot as plt# Load data
import pandas as pd
import numpy as np
%matplotlib inline
# Load the Titanic passenger table into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path, so the cell only
# runs where that exact file exists; consider a configurable data directory.
df = pd.read_csv(
    '/Users/annette/Desktop/IntroToDataScienceClass/Lesson1/Numpy and Pandas/TitanicData.csv'
)

# Scatter plot of ticket fare (x-axis) against passenger age (y-axis).
plt.scatter(x=df['Fare'], y=df['Age'])
plt.xlabel('Fare')
plt.ylabel('Age')
Coloring the points by class (next cell) shows that first-class passengers have a larger variation in fare and often pay more than third-class passengers.
In [ ]:
# Fare vs. age for third- and first-class passengers, one colour per class.
# Class 3 is drawn first, so the class 1 points are painted on top of it.
for pclass, colour, series_label in ((3, 'Green', 'Class 3'), (1, 'Red', 'Class 1')):
    in_class = df['Pclass'] == pclass
    plt.scatter(df.loc[in_class, 'Fare'], df.loc[in_class, 'Age'],
                c=colour, label=series_label)
plt.xlabel('Fare')
plt.ylabel('Age')
# The label= arguments above are only rendered once a legend is requested.
plt.legend()
Box plots are useful for showing data spread as well as indicating whether a distribution is skewed and/or has potential unusual observations (outliers).
The lower edge of the box corresponds to the 25th percentile, the upper edge to the 75th percentile, and the line in the box corresponds to the 50th percentile (median). The difference between the 75th and 25th percentiles is called the interquartile range (IQR). The whiskers can be configured, but by convention they extend to the most extreme data points that lie within 1.5*IQR of the box edges; points beyond the whiskers are drawn individually as potential outliers. For symmetric distributions (e.g., bell-shaped curves) the median line will lie exactly in the middle of the box.
In [ ]:
# Box plot of passenger age, with one box per passenger class.
df.boxplot(
    by='Pclass',
    column='Age',
    grid=False,
)
In [ ]:
# Overlay the individual observations on the box plot to show the data spread.
df.boxplot(by='Pclass', column='Age', grid=False)
for pclass in (1, 2, 3):
    # Known (non-missing) ages of the passengers in this class.
    ages = df.loc[df['Pclass'] == pclass, 'Age'].dropna()
    # All points of a class share one x value: the class number
    # (presumably the position where boxplot draws that class's box).
    positions = np.full(ages.size, pclass, dtype=float)
    plt.plot(positions, ages, 'r.', alpha=0.2)
In [ ]:
# When points overlap heavily, add some jitter (random noise) to the x
# positions so individual observations become visible.
df.boxplot(column='Age', by='Pclass', grid=False)
# Seeded generator: the jitter, and therefore the figure, is identical on
# every run instead of changing each time the cell executes.
rng = np.random.RandomState(42)
for i in [1, 2, 3]:
    # Known (non-missing) ages of the passengers in class i.
    y = df.Age[df.Pclass == i].dropna()
    # Normally distributed x offsets centred on the class number (std 0.04).
    x = rng.normal(i, 0.04, size=len(y))
    plt.plot(x, y, 'r.', alpha=0.2)
In [ ]:
# pandas objects expose plotting helpers directly, so no explicit matplotlib
# call is needed here.
# Bar chart of the PassengerId count within each passenger class.
(
    df.groupby('Pclass')['PassengerId']
    .count()
    .plot.bar()
)
In [ ]:
# Cross tabulation: joint frequency table with passenger class as rows and
# the Survived column as columns.
survivalClassInfo = pd.crosstab(index=df['Pclass'], columns=df['Survived'])
survivalClassInfo
In [ ]:
# Draw the cross tabulation as a stacked bar chart: one bar per row of the
# table, one coloured segment per column.
survivalClassInfo.plot.bar(
    stacked=True,
    color=['Blue', 'Red'],
    grid=False,
)
In [ ]:
# Joint frequency table with passenger class as rows and sex as columns ...
genderClassInfo = pd.crosstab(index=df['Pclass'], columns=df['Sex'])
# ... drawn as a stacked bar chart, one bar per class.
genderClassInfo.plot.bar(
    stacked=True,
    color=['Blue', 'Red'],
    grid=False,
)
In [ ]:
# Divide every row by its own total so each class is shown as shares that
# sum to 1, then draw the result as stacked horizontal bars.
(
    genderClassInfo
    .div(genderClassInfo.sum(axis=1).astype(float), axis=0)
    .plot.barh(stacked=True, color=['Blue', 'Red'])
)
You can visualize summary statistics with bar charts, for example the mean fare per class with the standard deviation shown as error bars.
In [ ]:
# Mean fare per passenger class as bars; the error bars are the per-class
# standard deviation of the fare.
fare_by_class = df.groupby('Pclass')['Fare']
fare_by_class.mean().plot.bar(yerr=fare_by_class.std())
In [ ]:
# Distribution of ticket fares.
# No bin count is passed, so the default binning applies; the original note
# gives the resulting bin width as about $50 (this depends on the fare range
# in the data - TODO confirm).
df['Fare'].hist(grid=False)
In [ ]:
# Passing bins= controls the resolution: more bins give a finer histogram,
# fewer bins a coarser one.
df['Fare'].hist(bins=30, grid=False)
A density plot is similar to a histogram in that it describes the distribution of the underlying data, but rather than being a pure empirical representation, it is an estimate of the underlying "true" distribution. As a result, it is smoothed into a continuous line plot. We create them in Pandas using the plot method with kind='kde', where kde stands for kernel density estimate.
In [ ]:
# Kernel density estimate of the fare distribution, with the x-axis limited
# to the range 0-600.
df['Fare'].plot.kde(xlim=(0, 600))
In [ ]:
# Age distribution by class: one KDE curve per passenger class, all drawn on
# the same axes with a legend entry each.
for pclass, curve_label in ((1, 'Class 1'), (2, 'Class 2'), (3, 'Class 3')):
    # Missing ages are dropped before estimating the density.
    known_ages = df.loc[df['Pclass'] == pclass, 'Age'].dropna()
    known_ages.plot(kind='kde', label=curve_label, legend=True)
plt.xlabel("Age")
plt.title("Age Distribution within classes")
In [ ]: