In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
In [2]:
from scipy import stats
The data set we will use for this exercise comes from the featured Kaggle datasets, containing data about cold calls to both existing and potential car insurance customers. The data provides some demographic data about each person called (customer), such as age, job, marital status and education, along with call details and some features indicating insurance status.
The goal here is to perform exploratory data analysis (EDA), using both visual EDA and some statistics and probability.
For statistics we will use the Chi-square test of independence, which is meant to be used with categorical data. In this case we will be concerned with the effects of some demographic features on the outcome of the calls.
Exponential distributions have the probability density function
$$f(x; \lambda) = \begin{cases} \lambda e^{-\lambda x} & \text{if } x \geq 0\\ 0 & \text{if } x < 0 \end{cases}$$and cumulative distribution function
$$F(x; \lambda) = 1 - e^{-\lambda x}.$$The expectation value (mean) is $\lambda^{-1}$, and percentiles can be computed using
$$F^{-1}(p; \lambda) = \frac{-\ln(1-p)}{\lambda},$$where $p$ is the desired percentile. Since we can easily compute the mean of our data, we can use it to obtain $\lambda$ and thus compute percentiles.
In our case, we can easily compute the mean of our data using pandas:
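As a quick check (a minimal sketch: the mean value below is illustrative, not the actual call_duration mean computed later in the notebook), the closed-form percentile formula agrees with scipy.stats.expon:

import numpy as np
from scipy import stats

mean_duration = 350.0                    # illustrative mean, not taken from the data
lam = 1 / mean_duration                  # lambda = 1 / mean
for p in (0.5, 0.95):
    closed_form = -np.log(1 - p) / lam   # F^{-1}(p; lambda)
    via_scipy = stats.expon.ppf(p, scale=1 / lam)
    print(p, closed_form, via_scipy)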
Chi-square tests are used to compare two categorical variables from a single population; they are used to determine if there is a significant association between them. The test has three requirements:
1. the variables under study are categorical,
2. the data come from a (simple) random sample of the population,
3. the expected frequency in each cell of the contingency table is at least 5.
For this set we will assume the second requirement has been met.
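As a minimal sketch of how the expected-count requirement can be checked (the counts here are hypothetical, not taken from this data set), scipy.stats.chi2_contingency also returns the table of expected frequencies:

import numpy as np
from scipy import stats

toy = np.array([[20, 15],
                [30, 35]])                       # hypothetical contingency table
chi2, p_value, dof, expected = stats.chi2_contingency(toy)
print(chi2, p_value, dof)
print(expected)                                  # every expected count should be at least 5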
In [28]:
insurance = pd.read_csv('data/car_insurance.csv')
# very useful from the solution: add parse_dates=list_of_dates to have some columns already in datetime format!
In [4]:
insurance.head()
Out[4]:
In [5]:
insurance.shape
Out[5]:
In [6]:
insurance.info()
In [7]:
insurance.describe(include='all').T
Out[7]:
In [8]:
# print(min(pd.to_datetime(insurance.CallStart)), max(pd.to_datetime(insurance.CallStart)))
# print(min(pd.to_datetime(insurance.CallEnd)), max(pd.to_datetime(insurance.CallEnd)))
# call duration in seconds
insurance['call_duration'] = (pd.to_datetime(insurance.CallEnd) - pd.to_datetime(insurance.CallStart)).dt.total_seconds()
In [9]:
insurance.call_duration.head()
Out[9]:
In [10]:
insurance[['CarInsurance', 'call_duration']].corr()
Out[10]:
The variables don't seem to be highly correlated.
In [12]:
fig = plt.figure(figsize=(16, 10))
ax = sns.boxplot(x='CarInsurance', y='call_duration', data=insurance);
In the solutions a t-test is performed, but it isn't very useful so I'm not reproducing it.
In [14]:
fig = plt.figure(figsize=(16, 8))
ax = sns.distplot(insurance.call_duration, bins=25, kde=False);
In [15]:
insurance.call_duration.mean()
Out[15]:
In [16]:
# Estimate lambda from the sample mean, then use the inverse CDF for the percentiles
lam = insurance.call_duration.mean()**-1
mean_est = lam**-1
median_est = -np.log(1-0.5)/lam
perc95_est = -np.log(1-0.95)/lam
print('Estimated mean, median and 95th percentile: {:.5f}, {:.5f}, {:.5f}'.format(
    mean_est,
    median_est,
    perc95_est))
In [17]:
print('Mean, median and 95th percentile: {:.5f}, {:.5f}, {:.5f}'.format(
    insurance.call_duration.mean(),
    insurance.call_duration.median(),
    insurance.call_duration.quantile(0.95)))
In [18]:
print('Has car insurance mean, median and 95th percentile: {:.5f}, {:.5f}, {:.5f}'.format(
    insurance[insurance.CarInsurance == 1].call_duration.mean(),
    insurance[insurance.CarInsurance == 1].call_duration.median(),
    insurance[insurance.CarInsurance == 1].call_duration.quantile(0.95)))
print('Doesn\'t have car insurance mean, median and 95th percentile: {:.5f}, {:.5f}, {:.5f}'.format(
    insurance[insurance.CarInsurance == 0].call_duration.mean(),
    insurance[insurance.CarInsurance == 0].call_duration.median(),
    insurance[insurance.CarInsurance == 0].call_duration.quantile(0.95)))
In [19]:
for col in insurance.columns:
    if set(insurance[col].unique()) == set([0, 1]):
        print(col)
In [20]:
from scipy.special import binom as binom
p = insurance.CarInsurance.sum()/insurance.CarInsurance.count()
print(binom(10, 3) * p**3 * (1-p)**7)
print(stats.binom.pmf(3, n=10, p=p))
In [21]:
print(stats.binom.cdf(3, n=10, p=p))
In [29]:
def age_cat(df):
    if df.Age >= 65:
        return '65+'
    elif df.Age >= 55:
        return '55-64'
    elif df.Age >= 45:
        return '45-54'
    elif df.Age >= 35:
        return '35-44'
    elif df.Age >= 25:
        return '25-34'
    elif df.Age >= 18:
        return '18-24'
    else:
        return 'Something strange'
In [30]:
insurance['age_categories'] = insurance.apply(age_cat, axis=1)
In [59]:
# fig = plt.figure(figsize=(16, 6))
# ax = plt.axes()
# ax = sns.barplot(x='index', y='age_categories', stacked=True, data=insurance.age_categories.value_counts().reset_index())
# ax.set_xlabel('Age Categories')
# ax.set_ylabel('Count');
ax = pd.DataFrame(insurance.age_categories.value_counts()).T.plot.barh(figsize=(16, 6), stacked=True)
In [37]:
fig = plt.figure(figsize=(16, 6))
ax = plt.axes()
ax = sns.distplot(insurance.Age);
The distribution seems roughly normal, but it is a little skewed to the right and shows a couple of additional peaks; it could be closer to a bimodal distribution.
In [61]:
print(insurance.Age.describe())
print(insurance.Age.skew())
print(insurance.Age.kurtosis())
In [55]:
edu_contingency = pd.pivot_table(insurance, values='Id', index='Education', columns='Outcome', aggfunc='count', margins=True)
ins_contingency = pd.pivot_table(insurance, values='Id', index='CarInsurance', columns='Outcome', aggfunc='count', margins=True)
job_contingency = pd.pivot_table(insurance, values='Id', index='Job', columns='Outcome', aggfunc='count', margins=True)
age_contingency = pd.pivot_table(insurance, values='Id', index='age_categories', columns='Outcome', aggfunc='count', margins=True)
In [59]:
print('Education vs. Outcome:\n\n{}\n\n'.format(edu_contingency))
print('Car insurance vs. Outcome:\n\n{}\n\n'.format(ins_contingency))
print('Job vs. Outcome:\n\n{}\n\n'.format(job_contingency))
print('Age category vs. Outcome:\n\n{}\n\n'.format(age_contingency))
In [67]:
def contingency_heatmap(df):
    # normalize by the grand total and drop the margins before plotting
    data = df.drop('All').drop('All', axis=1) / df.loc['All', 'All']
    fig = plt.figure(figsize=(10, 6))
    ax = sns.heatmap(data)
In [68]:
contingency_heatmap(edu_contingency)
contingency_heatmap(ins_contingency)
contingency_heatmap(job_contingency)
contingency_heatmap(age_contingency)
In [70]:
stats.chi2_contingency(edu_contingency.drop('All').drop('All', axis=1))
Out[70]:
In [72]:
stats.chi2_contingency(ins_contingency.drop('All').drop('All', axis=1))
Out[72]:
In [73]:
stats.chi2_contingency(job_contingency.drop('All').drop('All', axis=1))
Out[73]:
In [74]:
stats.chi2_contingency(age_contingency.drop('All').drop('All', axis=1))
Out[74]:
Answer: the job table because it has counts under 5 in some cells.
In [77]:
stats.chi2_contingency(edu_contingency.drop('All').drop(['All', 'other'], axis=1))
Out[77]:
In [78]:
stats.chi2_contingency(ins_contingency.drop('All').drop(['All', 'other'], axis=1))
Out[78]:
In [79]:
stats.chi2_contingency(job_contingency.drop('All').drop(['All', 'other'], axis=1))
Out[79]:
In [80]:
stats.chi2_contingency(age_contingency.drop('All').drop(['All', 'other'], axis=1))
Out[80]:
The job table still isn't good:
In [81]:
job_contingency.drop('All').drop(['All', 'other'], axis=1)
Out[81]:
Thus, from the tests we can say that the Outcome variable is not independent of Education, CarInsurance and age_categories.
This data set comes from the featured Kaggle datasets, containing three tables that relate to commercial airline flights. The flight delay and cancellation data was collected and published by the DOT's Bureau of Transportation Statistics.
There are three tables: flights, airlines and airports. The airports table contains:
IATA_CODE: Location Identifier (String)
AIRPORT: Airport's Name (String)
CITY: City Name of the Airport (String)
STATE: State Name of the Airport (String)
COUNTRY: Country Name of the Airport (String)
LATITUDE: Latitude of the Airport (Numeric)
LONGITUDE: Longitude of the Airport (Numeric)
We cleaned this data in the cleaning notebook; recall that we started with a sample of the full set available on Kaggle in order to reduce processing times.
The Poisson distribution represents the probability of a given number of events occurring in a fixed interval of time or space if these events occur with a known average rate and independently of the time since the last event. This comment about independence is actually referring to the memoryless property of the exponential distribution (a distribution introduced earlier). Put another way, the Poisson distribution describes a process made of a series of events whose waiting times each follow the same exponential distribution.
The Poisson probability formula:
$$P(x; \mu) = e^{-\mu}\frac{\mu^{x}}{x!}$$where $x$ is the number of successes and $\mu$ is the mean of the distribution.
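A minimal sketch (with illustrative values for $\mu$ and $x$, not derived from the flight data) comparing this formula against scipy.stats.poisson:

import math
from scipy import stats

mu, x = 3.5, 2                                   # illustrative mean and count
formula = math.exp(-mu) * mu**x / math.factorial(x)
print(formula, stats.poisson.pmf(x, mu=mu))      # the two values match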
Log transformations can be useful for a number of reasons; two of them are scaling data for easier interpretation and transforming data to fit a distribution that can be modeled (e.g. making exponential data linear).
Example (notice how x-values beyond 10 are off the charts, but ln(x) is at visible scales out to 1000 and beyond):
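Below is a minimal sketch of that example (purely illustrative, not tied to the flight data):

import numpy as np
import matplotlib.pyplot as plt

x = np.linspace(1, 1000, 500)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 4))
ax1.plot(x, x)
ax1.set_title('x')                               # values beyond ~10 dwarf the rest
ax2.plot(x, np.log(x))
ax2.set_title('ln(x)')                           # stays at a visible scale out to 1000 and beyond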
We will use the apply() method to create a table that contains the following stats for each airline: flight volume, flight proportion, diverted rate and cancelled rate.
In [3]:
flights = pd.read_csv('data/flights_cleaned_sm.csv')
In [63]:
flights.info()
In [4]:
airlines = pd.read_csv('data/airlines_cleaned.csv')
In [65]:
flights_volume = flights.groupby('airline')[['cancelled', 'diverted']].mean().merge(
    pd.DataFrame(flights.airline.value_counts()),
    left_index=True,
    right_index=True)
flights_volume.rename(columns={'airline':'flight_volume', 'diverted':'diverted_rate', 'cancelled':'cancelled_rate'}, inplace=True)
flights_volume['flight_proportion'] = flights_volume.flight_volume.apply(lambda x: x/len(flights))
flights_volume
# from solution:
# airline_stats = flights.groupby('airline').apply(
# lambda x: Series({
# 'flight_vol': x['airline'].count(),
# 'flight_prop': x['airline'].count() / len(flights),
# 'diverted_rate': x['diverted'].mean(),
# 'cancelled_rate': x['cancelled'].mean()}))
Out[65]:
In [7]:
volume_airline_airport = flights.groupby(['airline', 'origin_airport']).year.count()\
    .reset_index()\
    .merge(airlines, left_on='airline', right_on='iata_code', how='left')\
    .drop(['iata_code', 'airline_x'], axis=1)\
    .rename(columns={'origin_airport':'airport', 'airline_y':'airline', 'year':'volume'})
In [8]:
fig = plt.figure(figsize=(16, 16))
ax = sns.barplot(y='airport', x='volume', data=volume_airline_airport.groupby('airport').sum().reset_index().sort_values(by='volume', ascending=False).head(20))
ax.set_title('Volume by airport');
In [9]:
fig = plt.figure(figsize=(16, 8))
ax = sns.barplot(x='airline', y='volume', data=volume_airline_airport.groupby('airline').sum().reset_index())
ax.set_title('Volume by airline')
plt.xticks(rotation=45);
In [10]:
fig = plt.figure(figsize=(16, 48))
#ax = plt.axes()
sns.heatmap(volume_airline_airport.pivot_table(values='volume', index='airport', columns='airline', aggfunc='sum', fill_value=0))
Out[10]:
In [11]:
print(volume_airline_airport.groupby('airline').sum()
      .reset_index()
      .sort_values(by='volume', ascending=False)
      .iloc[0, 0])
print(volume_airline_airport[volume_airline_airport.airline=='Southwest Airlines Co.'].groupby('airport').sum()
      .reset_index()
      .sort_values(by='volume', ascending=False)
      .iloc[0, 0])
I misinterpreted the expected time between cancellations as the mean of the times between two consecutive cancellations; in the solution the column is the (max - min) scheduled departure divided by the number of cancellations:
In [66]:
# def cancellations_by_airport(airline):
# temp = flights[(flights.cancelled==1) & (flights.airline==airline)]\
# .sort_values(by=['scheduled_departure', 'flight_number'])\
# .loc[:, ['scheduled_departure', 'flight_number', 'origin_airport']]\
# .set_index('flight_number')
# temp['scheduled_departure'] = pd.to_datetime(temp['scheduled_departure'])
# temp['time_between'] = (temp.scheduled_departure - temp.scheduled_departure.shift(1)).astype('timedelta64[s]') / 3600
# temp = temp.groupby('origin_airport').aggregate([np.mean, 'count'])
# temp.columns = temp.columns.droplevel()
# return pd.DataFrame(flights[flights.airline==airline].groupby('origin_airport').year.count())\
# .merge(temp, left_index=True, right_index=True)\
# .rename(columns={'year':'number_flights', 'mean':'expected_time_between', 'count':'number_cancellations'})
In [69]:
# cancellations = cancellations_by_airport('DL')#.sort_values(by='number_cancellations', ascending=False)
So let's do it again this way:
In [8]:
cancellations = flights[flights.airline=='DL'].groupby('origin_airport').apply(
    lambda x: pd.Series({
        'number_flights': x.year.count(),
        'number_cancellations': x.cancelled.sum(),
        # span of scheduled departures (in hours) divided by the number of cancellations
        'expected_time_between': ((pd.to_datetime(x.scheduled_departure.max()) -
                                   pd.to_datetime(x.scheduled_departure.min())).total_seconds()/3600) / (x.cancelled.sum() + 1e-6)
    })
)
In [9]:
cancellations
Out[9]:
In [14]:
# def cancellations_by_airport_month(airline):
# temp = flights[(flights.cancelled==1) & (flights.airline==airline)]\
# .sort_values(by=['scheduled_departure', 'flight_number'])\
# .loc[:, ['scheduled_departure', 'flight_number', 'origin_airport', 'month']]\
# .set_index('flight_number')
# temp['scheduled_departure'] = pd.to_datetime(temp['scheduled_departure'])
# temp['time_between'] = (temp.scheduled_departure - temp.scheduled_departure.shift(1)).astype('timedelta64[s]') / 3600
# temp = temp.groupby(['origin_airport', 'month']).aggregate([np.mean, 'count'])
# temp.columns = temp.columns.droplevel()
# return pd.DataFrame(flights[flights.airline==airline].groupby(['origin_airport', 'month']).year.count())\
# .merge(temp, left_index=True, right_index=True)\
# .rename(columns={'year':'number_flights', 'mean':'expected_time_between', 'count':'number_cancellations'})
In [17]:
def cancellations_by_airport_month(airline):
    cancellations_by_month = flights[flights.airline==airline].groupby(['origin_airport', 'year', 'month']).apply(
        lambda x: pd.Series({
            'number_flights': x.year.count(),
            'number_cancellations': x.cancelled.sum(),
            # span of scheduled departures (in hours) divided by the number of cancellations
            'expected_time_between': ((pd.to_datetime(x.scheduled_departure.max()) -
                                       pd.to_datetime(x.scheduled_departure.min())).total_seconds()/3600) / (x.cancelled.sum() + 1e-6)
        })
    )
    return cancellations_by_month
In [18]:
top10 = ['ATL', 'ORD', 'DFW', 'DEN', 'LAX', 'IAH', 'SFO', 'PHX', 'LAS', 'LGA']
cancellation_month = cancellations_by_airport_month('DL').loc[top10].reset_index()
In [19]:
cancellation_month
Out[19]:
In [28]:
fig = plt.figure(figsize=(16, 8))
ax = plt.axes()
ax = sns.pointplot(x='month', y='expected_time_between', hue='origin_airport', data=cancellation_month)
ax.set_title('Expected time between cancellations for top 10 airports')
ax.set_ylabel('Mean expected time (hours)')
ax.set_xlabel('Month')
ax.set_yscale('log')
ax.legend(title='Airport');
In [33]:
import math
mu = cancellations_by_airport_month('DL').loc['BOS', 2015, 5].expected_time_between
x = 5
print(stats.poisson.sf(k=x, mu=mu))
In [38]:
cancellation_counts = flights[flights.airline.isin(['WN', 'DL', 'AA'])].pivot_table(values='year', columns='cancelled', index='airline', aggfunc='count')
cancellation_counts
Out[38]:
In [39]:
stats.chi2_contingency(cancellation_counts)
Out[39]:
The expected frequencies are all above 5 and the p-value is essentially zero, so we can reject the null hypothesis that there is no difference between the cancellation rates for the top 3 airlines.
In [40]:
travel_routes = flights[(flights.airline=='DL')
                        & ((flights.origin_airport.isin(top10))
                           & (flights.destination_airport.isin(top10)))]\
    .loc[:, ['flight_number', 'origin_airport', 'destination_airport', 'diverted']]
# travel_routes['route'] = travel_routes.origin_airport + '-' + travel_routes.destination_airport
# travel_routes.groupby('route').diverted.sum() / travel_routes.groupby('route').diverted.count()
travel_routes.groupby(['origin_airport', 'destination_airport']).diverted.mean().unstack()
Out[40]:
In [43]:
plt.figure(figsize=(16, 10))
sns.heatmap(travel_routes.groupby(['origin_airport', 'destination_airport']).diverted.mean().unstack());
In [24]:
flights['ground_speed'] = flights.distance * 1.60934 / flights.air_time * 60
print(flights.sort_values(by='ground_speed').ground_speed.head())
print(flights.sort_values(by='ground_speed', ascending=False).ground_speed.head())
print(flights.iloc[203450])
print(flights.iloc[546104])
There are some infinite values due to an air time of 0 (we still have some data cleaning to do, I would say) and some very low ones. It seems that using air time and distance isn't a good way to measure ground speed, for example in the case of delays.
In [45]:
delays = flights.groupby('airline')[['air_system_delay', 'security_delay', 'airline_delay', 'late_aircraft_delay', 'weather_delay']].mean().unstack().reset_index()
delays.columns = ['reason', 'airline', 'mean_time']
fig = plt.figure(figsize=(16, 8))
ax = sns.barplot(x='airline', y='mean_time', data=delays, hue='reason');
I interpreted the delay proportions as the proportion of delays over the total number of flights:
In [26]:
delay_proportion = (flights.groupby('airline').airline_delay.count() / flights.groupby('airline').year.count()).reset_index()
delay_proportion.columns = ['airline', 'delay_proportion']
fig = plt.figure(figsize=(10, 10))
ax = sns.barplot(x='delay_proportion', y='airline', data=delay_proportion);
This is the way it is done in the solutions:
In [56]:
ax = delays.pivot_table(columns='reason', index='airline', values='mean_time').plot.barh(figsize=(16, 10), stacked=True);
In [27]:
flights_sample = flights[flights.airline.isin(['AA', 'DL', 'WN'])].sample(frac=0.1)
plt.figure(figsize=(16, 10))
pd.plotting.parallel_coordinates(flights_sample[['air_system_delay', 'security_delay', 'airline_delay', 'late_aircraft_delay', 'weather_delay', 'airline']], 'airline');
The exponential distribution has a special property that is referred to as the memoryless property. In terms of time, it states that the probability of an exponential random variable exceeding some value does not depend on how much time has already elapsed. Mathematically it looks like
$$P(X > s + t \mid X > t) = P(X > s),$$where $t$ is the amount of time already elapsed and $s$ is the additional time.
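A quick numerical check of this property (a minimal sketch with an illustrative scale parameter, not fitted to any of the data here):

from scipy import stats

scale = 600.0                                    # illustrative exponential mean
s, t = 300.0, 450.0
conditional = stats.expon.sf(s + t, scale=scale) / stats.expon.sf(t, scale=scale)
print(conditional, stats.expon.sf(s, scale=scale))   # the two probabilities are equal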
Pandas offers easy to use tools for working with time series. A couple of these tools include resampling the frequency (downsampling and upsampling), and easily plotting time series.
The resample() method lets us easily downsample and upsample a time series; it requires a DatetimeIndex, which we can create with the set_index() method.
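A minimal sketch on a toy hourly series (not the trip data) of setting a DatetimeIndex with set_index() and downsampling with resample():

import numpy as np
import pandas as pd

idx = pd.date_range('2016-01-01', periods=48, freq='H')
toy = pd.DataFrame({'timestamp': idx, 'value': np.arange(48)})
toy = toy.set_index('timestamp')                 # resample() needs a DatetimeIndex
print(toy.resample('D').mean())                  # downsample hourly values to daily means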
In [57]:
cycle = pd.read_csv('data/trip_cleaned.csv')
In [4]:
cycle.info()
In [58]:
temperatures = cycle[['starttime', 'Min_TemperatureF', 'Mean_Temperature_F', 'Max_Temperature_F']].copy()
temperatures['starttime'] = pd.to_datetime(temperatures.starttime)
temperatures.set_index('starttime', inplace=True)
In [59]:
fig = plt.figure(figsize=(16, 10))
ax = temperatures.Min_TemperatureF.plot(kind='line', style=':', color='b', label='Min')
ax = temperatures.Max_Temperature_F.plot(kind='line', style=':', color='r', label='Max')
ax = temperatures.Mean_Temperature_F.plot(kind='line', style='-', color='k', label='Mean')
ax.set_title('Temperature variations')
ax.set_xlabel('Date')
ax.set_ylabel('Temp (F)')
ax.legend();
In [60]:
temperatures_daily = temperatures.resample('D').mean()
In [61]:
temperatures_roll = temperatures_daily.rolling(7, center=True).mean().dropna()
In [62]:
fig = plt.figure(figsize=(16, 10))
ax = temperatures_roll.Min_TemperatureF.plot(kind='line', style=':', color='b', label='Min')
ax = temperatures_roll.Max_Temperature_F.plot(kind='line', style=':', color='r', label='Max')
ax = temperatures_roll.Mean_Temperature_F.plot(kind='line', style='-', color='k', label='Mean')
ax.set_title('Temperature 7-days rolling mean')
ax.set_xlabel('Date')
ax.set_ylabel('Temp (F)')
ax.legend();
In [66]:
trips = cycle[['trip_id', 'starttime', 'tripduration', 'from_station_name', 'usertype']].copy()
trips['starttime'] = pd.to_datetime(trips.starttime)
# trips.set_index('starttime', inplace=True)
In [6]:
trips_hourly = trips[trips.starttime.dt.month==7].set_index('starttime')
trips_hourly = trips_hourly.resample('H').trip_id.count().reset_index()
trips_hourly = trips_hourly.groupby(trips_hourly.starttime.dt.hour).trip_id.sum() / 31
In [7]:
fig = plt.figure(figsize=(16, 10))
ax = trips_hourly.plot(kind='line')
ax.set_title('Average number of trips in July')
ax.set_xlabel('Hour')
ax.set_ylabel('Average number of trips')
ax.xaxis.set_major_locator(plt.MaxNLocator(12));
In [8]:
trips_hour_station = trips[trips.starttime.dt.month==7].pivot_table(index='starttime', columns='from_station_name', values='trip_id', aggfunc='count')
trips_hour_station = trips_hour_station.resample('H').count()#.reset_index()
trips_hour_station = trips_hour_station.groupby(trips_hour_station.index.hour).sum() / 31
In [9]:
ax = trips_hour_station.iloc[:,0:10].plot(kind='line', figsize=(16, 10))
ax.set_title('Average number of trips per station in July')
ax.set_xlabel('Hour')
ax.set_ylabel('Average number of trips')
ax.set_xticks(range(0, 23, 2));
In [67]:
may = trips[trips.starttime.dt.month==5]
In [68]:
fig = plt.figure(figsize=(16, 10))
ax = sns.distplot(may.tripduration);
I would describe this distribution as exponential; the expected value and $P(X > 1200)$ are:
In [69]:
print(may.tripduration.mean())
print(stats.expon.sf(1200, scale=may.tripduration.mean()))
The probability for the second case is the same as the first because of the memorylessness of the exponential distribution.
The probability for the third case is $P(X > 4 \cdot 60)$:
In [70]:
print(stats.expon.sf(240, scale=may.tripduration.mean()))
In [82]:
ax = trips[trips.starttime.dt.month.isin([6, 7, 8])].boxplot(column='tripduration', by='from_station_name', figsize=(18,10), rot=90)
ax.set_ylim(0, 8000)
ax.set_title('Trip duration by station in June, July and August')
ax.set_xlabel('Station');
In [87]:
plt.figure(figsize=(16, 8))
ax = sns.violinplot(x=trips.starttime.dt.year, y=trips.tripduration, hue=trips.usertype, split=True)
In [98]:
water = cycle[['starttime', 'Precipitation_In', 'Mean_Humidity']].copy()
water['starttime'] = pd.to_datetime(water.starttime)
water = water[water.starttime.dt.year==2016]
water.set_index('starttime', inplace=True)
In [103]:
ax = water.resample('D').Precipitation_In.mean().plot(label='Mean Precipitation', figsize=(16, 8), legend=True)
ax = water.resample('D').Mean_Humidity.mean().plot(secondary_y=True, label='Mean Humidity', legend=True)
ax.set_xlabel('Date')
ax.set_title('Precipitation and humidity in 2016');