In [1]:
import scipy as sci
import pandas as pd
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Remember that for specific functions, the array function in numpy
# can be useful in listing out the elements in a list (example would
# be for finding the mode.)
# Load the raw CSV into `data`: a list of rows, each row a list of string fields.
# Records are split per line rather than on arbitrary whitespace, so a field
# that contains a space cannot break a row apart; blank lines are skipped.
with open('./data/sat_scores.csv', 'r') as f:
    data = [line.strip().split(",") for line in f.read().splitlines() if line.strip()]
print(data)
The data describes SAT scores for the verbal and math sections in 2001 across the US. It appears to be complete, apart from one puzzle with the median math score: when I ran the median function on sat_scores.math, it returned a value of 521, yet I could not find that value anywhere in the dataset. Below are some other observations I made.
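Overall, the data does look complete, but while doing my EDA I noticed that the median value computed for Math, 521, does not actually appear in the list of Math scores. This does not necessarily mean there is an issue with the data: when a column holds an even number of values, the median is the average of the two middle values, which need not itself be one of the observed scores.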
In [2]:
# Peel the first row off as the column names; every remaining row is a record.
# NOTE: `data` is overwritten here, so re-running this cell drops one more row.
header, data = data[0], data[1:]
print(header)
In [3]:
# Pivot the row-oriented records into a dict of columns:
# {column name: [that column's value from each row, in row order]}.
sat_data = {
    column_name: [record[position] for record in data]
    for position, column_name in enumerate(header)
}
In [4]:
# Keep the State column under its own name; it is reused below as the key set
# for the per-state lookup dictionaries.
state_names = sat_data['State']
# Call-form print works on both Python 2 and 3 (and matches print(header) above).
print(state_names)
In [5]:
# Report the Python type of a sample value (row index 2) from every column.
# Each message names the column it actually inspects (previously all four
# lines claimed to describe the State column).
for column in ('State', 'Math', 'Verbal', 'Rate'):
    print('The type of the ' + column + ' column is' + ' ' + str(type(sat_data[column][2])))
In [6]:
#Math, Verbal, and Rate need to be reassigned to integers.
# Assigning to the loop variable only rebinds a local name and leaves the list
# untouched, so build a new list of ints and store it back under the same key.
sat_data['Math'] = [int(item) for item in sat_data['Math']]
In [7]:
# Convert the Verbal column to integers. Assigning to the loop variable would
# not modify the list, so the converted list is stored back under the same key.
sat_data['Verbal'] = [int(item) for item in sat_data['Verbal']]
In [8]:
# Convert the Rate column to integers. Assigning to the loop variable would
# not modify the list, so the converted list is stored back under the same key.
sat_data['Rate'] = [int(item) for item in sat_data['Rate']]
In [9]:
# Per-state lookup tables: {state: that state's own value}.
# zip pairs each state with the entry at the same position in the column
# (previously every state was mapped to the entire column list).
# int() keeps the values numeric whether the column holds strings or ints.
verbal_values = {state: int(score) for state, score in zip(state_names, sat_data['Verbal'])}
math_values = {state: int(score) for state, score in zip(state_names, sat_data['Math'])}
rate_values = {state: int(rate) for state, rate in zip(state_names, sat_data['Rate'])}
In [10]:
#SAT_values = {x:sat_data['Verbal'] for x in sat_data['Verbal']}
In [11]:
#Convert to a pandas dataframe to perform functions.
# Build the frame from the dict of columns, then cast the three numeric
# columns to int so arithmetic and plotting treat them as numbers.
SAT_scores = pd.DataFrame(sat_data)
for numeric_column in ('Math', 'Verbal', 'Rate'):
    SAT_scores[numeric_column] = SAT_scores[numeric_column].astype(int)
In [12]:
# Report the observed minimum and maximum of each numeric column.
# Call-form print runs on both Python 2 and 3; the emitted text is unchanged.
for label, column in (('Verbal score', SAT_scores.Verbal),
                      ('Math score', SAT_scores.Math),
                      ('Rate', SAT_scores.Rate)):
    print('The minimum ' + label + ' is' + ' ' + str(min(column)))
    print('The maximum ' + label + ' is' + ' ' + str(max(column)))
In [13]:
#Standard Deviation function.
from math import sqrt


def standard_deviation(column):
    """Print and return the sample standard deviation of ``column``.

    The variance is computed with the n - 1 denominator, i.e. the sample
    (not population) standard deviation.

    Parameters
    ----------
    column : sized iterable of numbers
        For example a list or a pandas Series. It needs at least two values;
        with fewer, the n - 1 denominator is zero.

    Returns
    -------
    float
        The sample standard deviation. The value is also printed, as before,
        so existing cells that rely on the printed output keep working.
    """
    count = len(column)
    # float() forces true division. With integer data on Python 2, plain
    # ``sum / len`` and ``num / den`` floor their results, which skews the
    # mean and the variance.
    mean = float(sum(column)) / count
    squared_deviations = [(value - mean) ** 2 for value in column]
    variance = sum(squared_deviations) / (count - 1)
    result = sqrt(variance)
    print(result)
    return result
In [14]:
# Standard deviation of the Math column (n - 1 denominator).
standard_deviation(SAT_scores.Math)
In [15]:
# Standard deviation of the Verbal column (n - 1 denominator).
standard_deviation(SAT_scores.Verbal)
In [16]:
# Standard deviation of the Rate column (n - 1 denominator).
standard_deviation(SAT_scores.Rate)
In [18]:
#Check to see the standard deviations are right.
# describe() reports the sample standard deviation (n - 1 denominator), so its
# 'std' row is directly comparable with the standard_deviation() output.
print(SAT_scores.describe())
#Approximately on point.
In [19]:
# Find the mean, median, and mode for the set of verbal scores and the set of math scores.
# numpy (np) and scipy.stats (stats) are imported in the first cell.
# Mean (Verbal, then Math).
print(np.mean(SAT_scores.Verbal))
print(np.mean(SAT_scores.Math))
# Median (Verbal, then Math). For an even number of values np.median averages
# the two middle values, so the result need not be an observed score.
print(np.median(SAT_scores.Verbal))
print(np.median(SAT_scores.Math))
#Numpy doesn't have a built in function for mode. However, stats does;
#its function returns the mode, and how many times the mode appears.
verb_mode = stats.mode(SAT_scores.Verbal)
math_mode = stats.mode(SAT_scores.Math)
print(verb_mode)
print(math_mode)
In [ ]:
#Will be using Pandas dataframe for plotting.
In [20]:
import seaborn as sns
import matplotlib.pyplot as plt
In [21]:
# Plot the pairwise relationships between the columns of the frame.
pair_grid = sns.pairplot(SAT_scores)
plt.show()
Verbal and Math scores are highly correlated with each other, whichever way you plot them, with Verbal appearing to change at a faster rate as Math changes than the other way around. (Correlation on its own does not show that either score affects the other.)
In [22]:
# Not really. I had already assigned the Verbal, Math, and Rate columns to integers,
# so no conversion is needed there.
In [57]:
# Coerce the Verbal column to a numeric dtype in one vectorised call.
SAT_scores['Verbal'] = pd.to_numeric(SAT_scores['Verbal'])
In [58]:
# Coerce the Math column to a numeric dtype in one vectorised call.
SAT_scores['Math'] = pd.to_numeric(SAT_scores['Math'])
In [59]:
# Coerce the Rate column to a numeric dtype in one vectorised call.
SAT_scores['Rate'] = pd.to_numeric(SAT_scores['Rate'])
In [60]:
# Display the dtype of every column to check the result of the numeric conversions.
SAT_scores.dtypes
Out[60]:
In [24]:
# Display box plots to visualize the distribution of the datasets.
# Recall the median verbal score is 526, the mean is 532, the max is 593, the min is 482,
# and the std. deviation is 33.236.
In [25]:
# Box plot of the Verbal column, using every row of the frame.
ax = sns.boxplot(y=SAT_scores.Verbal, saturation=0.75, width=0.1, fliersize=5)
ax.set(xlabel = 'SAT Verbal Scores', ylabel = 'Range of Scores')
# The plotted column covers all rows rather than a single state, so the title
# does not name one (it previously said "Iowa").
ax.set_title('2001 SAT Verbal Scores Distribution', fontsize = 15)
plt.show()
In [27]:
# Box plot of the Math column; labels are set on the Axes that seaborn returns.
math_box_ax = sns.boxplot(data=SAT_scores, y=SAT_scores.Math,
                          saturation=0.75, width=0.1, fliersize=5)
math_box_ax.set_xlabel('SAT Math Scores')
math_box_ax.set_ylabel('Range of Scores')
plt.show()
In [93]:
# Box plot of the Rate column; labels are set on the Axes that seaborn returns.
rate_box_ax = sns.boxplot(data=SAT_scores, y=SAT_scores.Rate,
                          saturation=0.75, width=0.1, fliersize=5)
rate_box_ax.set_xlabel('SAT Rates')
rate_box_ax.set_ylabel('Range of Rates')
plt.show()
In [81]:
# 15-bin histogram of the Math column.
math_hist_ax = SAT_scores['Math'].plot(kind='hist', bins=15)
math_hist_ax.set_xlabel('SAT Math Scores')
math_hist_ax.set_ylabel('Frequency')
plt.show()
In [28]:
# 15-bin histogram of the Verbal column.
verbal_hist_ax = SAT_scores['Verbal'].plot(kind='hist', bins=15)
verbal_hist_ax.set_xlabel('SAT Verbal Scores')
verbal_hist_ax.set_ylabel('Frequency')
plt.show()
In [82]:
# 15-bin histogram of the Rate column.
rate_hist_ax = SAT_scores['Rate'].plot(kind='hist', bins=15)
rate_hist_ax.set_xlabel('SAT Rates')
rate_hist_ax.set_ylabel('Frequency')
plt.show()
In [29]:
# Reference: seaborn distributions tutorial,
# http://seaborn.pydata.org/tutorial/distributions.html
# distplot draws the histogram together with a Kernel Density Estimate (KDE):
# a smoothed estimate of the distribution built from the individual data points.
# The area under the resulting curve equals one, which is why the y-axis shows
# decimal values rather than counts.
verbal_dist_ax = sns.distplot(SAT_scores.Verbal, bins=15)
verbal_dist_ax.set_xlabel('SAT Verbal Scores')
verbal_dist_ax.set_ylabel('Frequency (KDE)')
plt.show()
In [30]:
# Histogram (15 bins) of the Math column drawn with seaborn's distplot.
math_dist_ax = sns.distplot(SAT_scores.Math, bins=15)
math_dist_ax.set_xlabel('SAT Math Scores')
math_dist_ax.set_ylabel('Frequency (KDE)')
plt.show()
In [31]:
# Histogram (15 bins) of the Rate column drawn with seaborn's distplot.
rate_dist_ax = sns.distplot(SAT_scores.Rate, bins=15)
rate_dist_ax.set_xlabel('SAT Rates')
rate_dist_ax.set_ylabel('Frequency (KDE)')
plt.show()
In [32]:
# Kernel density estimate of the Verbal column on its own (no histogram bars).
verbal_kde_ax = sns.kdeplot(SAT_scores.Verbal)
verbal_kde_ax.set_xlabel('SAT Verbal Scores')
verbal_kde_ax.set_ylabel('Frequency (KDE)')
plt.show()
In [33]:
# Kernel density estimate of the Math column on its own (no histogram bars).
math_kde_ax = sns.kdeplot(SAT_scores.Math)
math_kde_ax.set_xlabel('SAT Math Scores')
math_kde_ax.set_ylabel('Frequency (KDE)')
plt.show()
In [34]:
# Kernel density estimate of the Rate column on its own (no histogram bars).
rate_kde_ax = sns.kdeplot(SAT_scores.Rate)
rate_kde_ax.set_xlabel('SAT Rates')
rate_kde_ax.set_ylabel('Frequency (KDE)')
plt.show()