In [1]:
import numpy as np
import pandas as pd
In [2]:
test_scores = [70,65,95,88]
type(test_scores)
Out[2]:
In [3]:
scores = np.array(test_scores)
type(scores)
Out[3]:
In [4]:
scores.mean()
Out[4]:
In [5]:
print(scores.max(), scores.min())
In [6]:
#More Calculations
income = np.array([75000, 55000, 88000, 125000, 64000, 97000])
In [7]:
# Report the mean, maximum and minimum income, one value per line.
print(income.mean(), income.max(), income.min(), sep='\n')
In [8]:
# Add one extreme value to show how strongly a single outlier shifts the mean.
# NOTE: `income` is rebound to the extended array, so re-running this cell
# appends the outlier again.
income = np.append(income, 12000000)
income.mean()
Out[8]:
In [9]:
np.median(income)
Out[9]:
In [10]:
# Standard Deviation
income.std()
Out[10]:
In [11]:
scores.std()
Out[11]:
In [12]:
scores.sum()
Out[12]:
In [13]:
# MATRICES
# Seed NumPy's global random generator so the matrix below is reproducible.
np.random.seed(seed=60)
# create a 5 by 5 Matrix of random floats drawn from np.random.rand
random_square = np.random.rand(5,5)
# Bare last expression: displays the matrix as the cell output.
random_square
Out[13]:
In [14]:
random_square[0]
Out[14]:
In [15]:
# Get column specified e.g. 0
random_square[:,0]
Out[15]:
In [16]:
random_square[0,0] == random_square[0][0]
Out[16]:
In [17]:
random_square.mean()
Out[17]:
In [18]:
# Mean of specified Row
random_square[0].mean()
Out[18]:
In [19]:
# Mean of specified column
random_square[:,0].mean()
Out[19]:
In [20]:
# Mean of last column
random_square[:,-1].mean()
Out[20]:
In [21]:
import time
In [22]:
%%time
# How long it takes to generate a 100000 by 100 matrix
np.random.seed(seed=60)
big_matrix = np.random.rand(100000, 100)
In [23]:
%%time
# duration to calculate mean
big_matrix = np.random.rand(100000, 100)
big_matrix.mean()
In [24]:
np.arange(1,101)
Out[24]:
In [25]:
# Reshape to 20 by 5
np.arange(1, 101).reshape(20,5)
Out[25]:
In [26]:
mat1 = np.arange(1, 101).reshape(20,5)
mat1 - 50
Out[26]:
In [27]:
mat1 * 10
Out[27]:
In [28]:
mat1 + mat1
Out[28]:
In [29]:
mat1 * mat1
Out[29]:
In [30]:
#take the dot product of mat1 and mat1.T
np.dot(mat1, mat1.T)
Out[30]:
In [31]:
# Create dictionary of test scores
test_dict = {'Corey':[63,75,88], 'Kevin':[48,98,92], 'Akshay': [87, 86, 85]}
In [32]:
# Create DataFrame
df = pd.DataFrame(test_dict)
In [33]:
df
Out[33]:
In [34]:
df.describe()
Out[34]:
In [35]:
# Transpose the DataFrame so that rows and columns swap places.
# NOTE: `df` is rebound to its own transpose, so running this cell a second
# time flips the frame back to its previous orientation.
df = df.T
df
Out[35]:
In [36]:
# Rename the columns
df.columns = ['Quiz_1', 'Quiz_2', 'Quiz_3']
df
Out[36]:
In [37]:
# Access first row by index number
df.iloc[0]
Out[37]:
In [38]:
df['Quiz_1']
Out[38]:
In [39]:
# Access first column using dot notation
df.Quiz_1
Out[39]:
In [40]:
# Limit DataFrame to first 2 rows
df[0:2]
Out[40]:
In [41]:
# Defining a new DataFrame from first 2 rows and last 2 columns
rows = ['Corey', 'Kevin']
cols = ['Quiz_2', 'Quiz_3']
df_spring = df.loc[rows, cols]
df_spring
Out[41]:
In [42]:
# Select first 2 rows and last 2 columns using index numbers
df.iloc[[0,1], [1,2]]
Out[42]:
In [43]:
# Define new column as mean of other columns
df['Quiz_Avg'] = df.mean(axis=1)
df
Out[43]:
In [44]:
# Create a new column
df['Quiz_4'] = [92, 95, 88]
df
Out[44]:
In [45]:
# Delete Column
del df['Quiz_Avg']
df
Out[45]:
In [46]:
# Create new DataFrame of one row.
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in NumPy 2.0.
df_new = pd.DataFrame(
    {'Quiz_1': [np.nan], 'Quiz_2': [np.nan], 'Quiz_3': [np.nan], 'Quiz_4': [71]},
    index=['Adrian'],
)
In [47]:
df_new
Out[47]:
In [48]:
# Concatenate DataFrames
df = pd.concat([df, df_new])
# Display new DataFrame
df
Out[48]:
In [49]:
df['Quiz_Avg'] = df.mean(axis=1, skipna=True)
df
Out[49]:
In [50]:
df.Quiz_4.astype(float)
Out[50]:
In [51]:
df
Out[51]:
In [52]:
housing_df = pd.read_csv('data/HousingData.csv')
In [53]:
housing_df.info()
In [54]:
housing_df.describe()
Out[54]:
In [55]:
housing_df.head()
Out[55]:
In [56]:
housing_df.shape
Out[56]:
In [57]:
housing_df.isnull().any()
Out[57]:
In [58]:
# show some records with NULL values
housing_df.loc[:5, housing_df.isnull().any()]
Out[58]:
In [59]:
# Data Description of columns with any Null values
housing_df.loc[:, housing_df.isnull().any()].describe()
Out[59]:
In [61]:
# Replace Null values in the data.
# Fill AGE with its own scalar mean. Passing `housing_df.mean()` (a Series
# indexed by column name) would be aligned against the row index of AGE,
# match no labels, and leave every missing AGE value unfilled.
housing_df['AGE'] = housing_df['AGE'].fillna(housing_df['AGE'].mean())
In [62]:
housing_df['CHAS'] = housing_df['CHAS'].fillna(0)
In [63]:
housing_df = housing_df.fillna(housing_df.median())
In [64]:
housing_df.info()
In [65]:
import matplotlib.pyplot as plt
%matplotlib inline
In [66]:
# !pip uninstall seaborn -y
import sys
sys.version
Out[66]:
In [67]:
# !pip3 install seaborn --upgrade
import seaborn as sns
sns.set()
In [68]:
plt.hist(housing_df['MEDV'])
plt.show()
In [69]:
plt.hist(housing_df['MEDV'])
plt.title('Median Boston Housing Prices')
plt.xlabel('1980 Median Value in Thousands')
plt.ylabel('Count')
plt.show()
In [70]:
housing_df.columns
Out[70]:
In [71]:
# Tweak more: larger figure, bigger title font, and save the figure to disk.
title = 'Median Boston Housing Prices'
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(housing_df['MEDV'])
ax.set_title(title, fontsize=15)
ax.set_xlabel('1980 Median Value in Thousands')
ax.set_ylabel('Count')
# The title doubles as the output file name.
fig.savefig(title, dpi=300)
plt.show()
In [72]:
def my_hist(column, title, xlab, ylab):
    """Plot a histogram of ``column``, save it to disk, and display it.

    Parameters
    ----------
    column : array-like
        Values to bin; forwarded unchanged to ``plt.hist``.
    title : str
        Figure title. Also passed to ``plt.savefig`` as the output file name.
    xlab : str
        Label for the x-axis.
    ylab : str
        Label for the y-axis.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(column)
    plt.title(title, fontsize=15)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    # The title doubles as the file name of the saved figure.
    plt.savefig(title, dpi=300)
    plt.show()
In [73]:
my_hist(housing_df['RM'], 'Average Number of Rooms in Boston Households', 'Average Number of Rooms', 'Count')
In [74]:
# Optimizing solution
def my_hist(column, title, xlab, ylab, bins=10, alpha=0.7, color='c', hist_range=(3, 9)):
    """Plot a styled histogram of ``column``, save it to disk, and display it.

    Parameters
    ----------
    column : array-like
        Values to bin; forwarded unchanged to ``plt.hist``.
    title : str
        Figure title. Also passed to ``plt.savefig`` as the output file name.
    xlab : str
        Label for the x-axis.
    ylab : str
        Label for the y-axis.
    bins : int, optional
        Number of histogram bins (default 10).
    alpha : float, optional
        Bar opacity (default 0.7).
    color : str, optional
        Matplotlib colour for the bars (default ``'c'``).
    hist_range : tuple of (low, high) or None, optional
        Lower and upper limits of the bins, forwarded as ``range`` to
        ``plt.hist``. Defaults to ``(3, 9)``, the value that was previously
        hard-coded; pass ``None`` to let matplotlib derive the limits from
        the data.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(column, bins=bins, range=hist_range, alpha=alpha, color=color)
    plt.title(title, fontsize=15)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    # The title doubles as the file name of the saved figure.
    plt.savefig(title, dpi=300)
    plt.show()
In [75]:
my_hist(housing_df['RM'], 'Average Number of Rooms in Boston Households', 'Average Number of Rooms', 'Count')
In [76]:
# SCATTER PLOTS
x = housing_df['RM']
y = housing_df['MEDV']
plt.scatter(x, y)
plt.show()
In [77]:
# CORRELATION - Data points Correlation
housing_df.corr()
Out[77]:
In [78]:
# Using Seaborn to visualize the Correlation
corr = housing_df.corr()
plt.figure(figsize=(12,10))
sns.heatmap(corr, xticklabels=corr.columns.values,
yticklabels=corr.columns.values, cmap="Blues", linewidths=1.25, alpha=0.8)
plt.show()
In [79]:
# REGRESSION
plt.figure(figsize=(13, 8))
# Pass the data by keyword: seaborn deprecated positional x/y in v0.11 and
# rejects them from v0.12 onward.
sns.regplot(x=x, y=y)
plt.show()
In [85]:
# More details on Regression line
import statsmodels.api as sm
X = sm.add_constant(x)
model = sm.OLS(y, X)
est = model.fit()
print(est.summary())
# The strangest part of the code is adding the constant. This is basically the y-intercept.
# When the constant is not added, the y-intercept is 0. In our case, it makes sense that
# the y-intercept would be 0: if there are 0 rooms, the house should have no value. In
# general, however, it's a good idea to keep a y-intercept, and it's the default choice of
# the preceding Seaborn graph.
There's a lot of important information in this table. The first is the value of $R^2$. This suggests that 48% of the variation in the median home value (MEDV) can be explained by the regression line. The second is the constant coefficient of -34.6706. This is the y-intercept. The third is the RM coefficient of 9.1021. Because MEDV is recorded in thousands, this suggests that for every one-room increase in the average number of rooms, the value of the house increased by 9,102. (Keep in mind that this is from 1980.)
The standard error indicates how much uncertainty there is in each estimated coefficient, and the numbers underneath the [0.025 0.975] column give the 95% Confidence Interval of the value, meaning statsmodels is 95% confident that the true increase in the value of the average house for every one-room increase is between 8,279 and 9,925.
In [ ]:
# !pip3 uninstall statsmodels #--upgrade
sys.version
In [89]:
# BOX PLOTS
plt.figure(figsize=(12, 8))
x = housing_df['RM']
y = housing_df['MEDV']
plt.boxplot(x)
plt.show()
In [90]:
# VIOLIN PLOTS
plt.figure(figsize=(10,8))
plt.violinplot(x)
plt.show()
In [91]:
# TASK: Salary Visualization
uk_salary_df = pd.read_csv('data/UKStatistics.csv')
In [92]:
uk_salary_df.info()
In [94]:
# GATHER NULL INFO IN DATASET
# housing_df.loc[:, housing_df.isnull().any()].describe()
uk_salary_df.loc[:, uk_salary_df.isnull().any()].describe()
Out[94]:
In [100]:
# uk_salary_df.head()
uk_salary_df['Unnamed: 15'].dropna()
# uk_salary_df.columns
Out[100]:
In [102]:
del uk_salary_df['Unnamed: 15']
In [103]:
uk_salary_df.columns
Out[103]:
In [107]:
# Histogram: Actual Pay Floor (£) [def my_hist(column, title, xlab, ylab, bins=10, alpha=0.7, color='c'):]
# Pass the column's values, not its name: given the bare string, plt.hist
# receives the label text itself instead of the salary data.
# NOTE(review): my_hist restricts the bins to the x-range (3, 9) chosen for the
# rooms plot; confirm that range suits the salary values.
my_hist(uk_salary_df["Actual Pay Floor (£)"], "UK Actual Pay", "pay", "level", bins=7, alpha=0.8, color='b')
In [106]:
title = 'UK Salary'
plt.figure(figsize=(10,6))
plt.hist(uk_salary_df['Actual Pay Floor (£)'])
plt.title(title, fontsize=15)
plt.xlabel('Salary in Thousands')
plt.ylabel('Level')
plt.savefig(title, dpi=300)
plt.show()
In [112]:
# scatter plot using the values for x as Salary Cost of Reports (£) and y as Actual Pay Floor (£).
plt.figure(figsize=(14,8))
# Vectorised multiplication replaces the element-wise lambda, whose parameter
# name also shadowed the outer `x`.
x = uk_salary_df['Salary Cost of Reports (£)'] * 1000
y = uk_salary_df['Actual Pay Floor (£)']
plt.scatter(x, y)
plt.show()
In [113]:
# BOX PLOT
plt.figure(figsize=(14,8))
plt.boxplot(x)
plt.show()
In [114]:
plt.figure(figsize=(14,8))
plt.boxplot(y)
plt.show()
In [115]:
uk_salary_df.describe()
Out[115]:
In [116]:
uk_salary_df.info()
In [ ]: