Calculations with NumPy - Statistics


In [1]:
import numpy as np
import pandas as pd

In [2]:
test_scores = [70,65,95,88]
type(test_scores)


Out[2]:
list

In [3]:
scores = np.array(test_scores)
type(scores)


Out[3]:
numpy.ndarray

In [4]:
scores.mean()


Out[4]:
79.5

In [5]:
print(scores.max(), scores.min())


95 65

In [6]:
#More Calculations
income = np.array([75000, 55000, 88000, 125000, 64000, 97000])

In [7]:
print(income.mean())
print(income.max())
print(income.min())


84000.0
125000
55000

In [8]:
income = np.append(income, 12000000)
income.mean()


Out[8]:
1786285.7142857143

In [9]:
np.median(income)


Out[9]:
88000.0

In [10]:
# Standard Deviation
income.std()


Out[10]:
4169786.007331644

In [11]:
scores.std()


Out[11]:
12.379418403139947

In [12]:
scores.sum()


Out[12]:
318

In [13]:
# MATRICES
np.random.seed(seed=60)
# create a 5 by 5 Matrix
random_square = np.random.rand(5,5)
random_square


Out[13]:
array([[0.30087333, 0.18694582, 0.32318268, 0.66574957, 0.5669708 ],
       [0.39825396, 0.37941492, 0.01058154, 0.1703656 , 0.12339337],
       [0.69240128, 0.87444156, 0.3373969 , 0.99245923, 0.13154007],
       [0.50032984, 0.28662051, 0.22058485, 0.50208555, 0.63606254],
       [0.63567694, 0.08043309, 0.58143375, 0.83919086, 0.29301825]])

In [14]:
random_square[0]


Out[14]:
array([0.30087333, 0.18694582, 0.32318268, 0.66574957, 0.5669708 ])

In [15]:
# Get column specified e.g. 0
random_square[:,0]


Out[15]:
array([0.30087333, 0.39825396, 0.69240128, 0.50032984, 0.63567694])

In [16]:
random_square[0,0] == random_square[0][0]


Out[16]:
True

In [17]:
random_square.mean()


Out[17]:
0.42917627159618377

In [18]:
# Mean of specified Row
random_square[0].mean()


Out[18]:
0.4087444389228477

In [19]:
# Mean of specified column
random_square[:,0].mean()


Out[19]:
0.5055070681931644

In [20]:
# Mean of last column
random_square[:,-1].mean()


Out[20]:
0.35019700684996913

In [21]:
import time

Computation Time Calculations


In [22]:
%%time
# How long it takes to generate a 100000 by 100 matrix
np.random.seed(seed=60)
big_matrix = np.random.rand(100000, 100)


CPU times: user 216 ms, sys: 43.6 ms, total: 260 ms
Wall time: 547 ms

In [23]:
%%time
# duration to calculate mean
big_matrix = np.random.rand(100000, 100)
big_matrix.mean()


CPU times: user 178 ms, sys: 36.7 ms, total: 215 ms
Wall time: 235 ms

Dealing with NumPy Arrays


In [24]:
np.arange(1,101)


Out[24]:
array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100])

In [25]:
# Reshape to 20 by 5
np.arange(1, 101).reshape(20,5)


Out[25]:
array([[  1,   2,   3,   4,   5],
       [  6,   7,   8,   9,  10],
       [ 11,  12,  13,  14,  15],
       [ 16,  17,  18,  19,  20],
       [ 21,  22,  23,  24,  25],
       [ 26,  27,  28,  29,  30],
       [ 31,  32,  33,  34,  35],
       [ 36,  37,  38,  39,  40],
       [ 41,  42,  43,  44,  45],
       [ 46,  47,  48,  49,  50],
       [ 51,  52,  53,  54,  55],
       [ 56,  57,  58,  59,  60],
       [ 61,  62,  63,  64,  65],
       [ 66,  67,  68,  69,  70],
       [ 71,  72,  73,  74,  75],
       [ 76,  77,  78,  79,  80],
       [ 81,  82,  83,  84,  85],
       [ 86,  87,  88,  89,  90],
       [ 91,  92,  93,  94,  95],
       [ 96,  97,  98,  99, 100]])

In [26]:
mat1 = np.arange(1, 101).reshape(20,5)
mat1 - 50


Out[26]:
array([[-49, -48, -47, -46, -45],
       [-44, -43, -42, -41, -40],
       [-39, -38, -37, -36, -35],
       [-34, -33, -32, -31, -30],
       [-29, -28, -27, -26, -25],
       [-24, -23, -22, -21, -20],
       [-19, -18, -17, -16, -15],
       [-14, -13, -12, -11, -10],
       [ -9,  -8,  -7,  -6,  -5],
       [ -4,  -3,  -2,  -1,   0],
       [  1,   2,   3,   4,   5],
       [  6,   7,   8,   9,  10],
       [ 11,  12,  13,  14,  15],
       [ 16,  17,  18,  19,  20],
       [ 21,  22,  23,  24,  25],
       [ 26,  27,  28,  29,  30],
       [ 31,  32,  33,  34,  35],
       [ 36,  37,  38,  39,  40],
       [ 41,  42,  43,  44,  45],
       [ 46,  47,  48,  49,  50]])

In [27]:
mat1 * 10


Out[27]:
array([[  10,   20,   30,   40,   50],
       [  60,   70,   80,   90,  100],
       [ 110,  120,  130,  140,  150],
       [ 160,  170,  180,  190,  200],
       [ 210,  220,  230,  240,  250],
       [ 260,  270,  280,  290,  300],
       [ 310,  320,  330,  340,  350],
       [ 360,  370,  380,  390,  400],
       [ 410,  420,  430,  440,  450],
       [ 460,  470,  480,  490,  500],
       [ 510,  520,  530,  540,  550],
       [ 560,  570,  580,  590,  600],
       [ 610,  620,  630,  640,  650],
       [ 660,  670,  680,  690,  700],
       [ 710,  720,  730,  740,  750],
       [ 760,  770,  780,  790,  800],
       [ 810,  820,  830,  840,  850],
       [ 860,  870,  880,  890,  900],
       [ 910,  920,  930,  940,  950],
       [ 960,  970,  980,  990, 1000]])

In [28]:
mat1 + mat1


Out[28]:
array([[  2,   4,   6,   8,  10],
       [ 12,  14,  16,  18,  20],
       [ 22,  24,  26,  28,  30],
       [ 32,  34,  36,  38,  40],
       [ 42,  44,  46,  48,  50],
       [ 52,  54,  56,  58,  60],
       [ 62,  64,  66,  68,  70],
       [ 72,  74,  76,  78,  80],
       [ 82,  84,  86,  88,  90],
       [ 92,  94,  96,  98, 100],
       [102, 104, 106, 108, 110],
       [112, 114, 116, 118, 120],
       [122, 124, 126, 128, 130],
       [132, 134, 136, 138, 140],
       [142, 144, 146, 148, 150],
       [152, 154, 156, 158, 160],
       [162, 164, 166, 168, 170],
       [172, 174, 176, 178, 180],
       [182, 184, 186, 188, 190],
       [192, 194, 196, 198, 200]])

In [29]:
mat1 * mat1


Out[29]:
array([[    1,     4,     9,    16,    25],
       [   36,    49,    64,    81,   100],
       [  121,   144,   169,   196,   225],
       [  256,   289,   324,   361,   400],
       [  441,   484,   529,   576,   625],
       [  676,   729,   784,   841,   900],
       [  961,  1024,  1089,  1156,  1225],
       [ 1296,  1369,  1444,  1521,  1600],
       [ 1681,  1764,  1849,  1936,  2025],
       [ 2116,  2209,  2304,  2401,  2500],
       [ 2601,  2704,  2809,  2916,  3025],
       [ 3136,  3249,  3364,  3481,  3600],
       [ 3721,  3844,  3969,  4096,  4225],
       [ 4356,  4489,  4624,  4761,  4900],
       [ 5041,  5184,  5329,  5476,  5625],
       [ 5776,  5929,  6084,  6241,  6400],
       [ 6561,  6724,  6889,  7056,  7225],
       [ 7396,  7569,  7744,  7921,  8100],
       [ 8281,  8464,  8649,  8836,  9025],
       [ 9216,  9409,  9604,  9801, 10000]])

In [30]:
#take the dot product of mat1 and mat1.T
np.dot(mat1, mat1.T)


Out[30]:
array([[   55,   130,   205,   280,   355,   430,   505,   580,   655,
          730,   805,   880,   955,  1030,  1105,  1180,  1255,  1330,
         1405,  1480],
       [  130,   330,   530,   730,   930,  1130,  1330,  1530,  1730,
         1930,  2130,  2330,  2530,  2730,  2930,  3130,  3330,  3530,
         3730,  3930],
       [  205,   530,   855,  1180,  1505,  1830,  2155,  2480,  2805,
         3130,  3455,  3780,  4105,  4430,  4755,  5080,  5405,  5730,
         6055,  6380],
       [  280,   730,  1180,  1630,  2080,  2530,  2980,  3430,  3880,
         4330,  4780,  5230,  5680,  6130,  6580,  7030,  7480,  7930,
         8380,  8830],
       [  355,   930,  1505,  2080,  2655,  3230,  3805,  4380,  4955,
         5530,  6105,  6680,  7255,  7830,  8405,  8980,  9555, 10130,
        10705, 11280],
       [  430,  1130,  1830,  2530,  3230,  3930,  4630,  5330,  6030,
         6730,  7430,  8130,  8830,  9530, 10230, 10930, 11630, 12330,
        13030, 13730],
       [  505,  1330,  2155,  2980,  3805,  4630,  5455,  6280,  7105,
         7930,  8755,  9580, 10405, 11230, 12055, 12880, 13705, 14530,
        15355, 16180],
       [  580,  1530,  2480,  3430,  4380,  5330,  6280,  7230,  8180,
         9130, 10080, 11030, 11980, 12930, 13880, 14830, 15780, 16730,
        17680, 18630],
       [  655,  1730,  2805,  3880,  4955,  6030,  7105,  8180,  9255,
        10330, 11405, 12480, 13555, 14630, 15705, 16780, 17855, 18930,
        20005, 21080],
       [  730,  1930,  3130,  4330,  5530,  6730,  7930,  9130, 10330,
        11530, 12730, 13930, 15130, 16330, 17530, 18730, 19930, 21130,
        22330, 23530],
       [  805,  2130,  3455,  4780,  6105,  7430,  8755, 10080, 11405,
        12730, 14055, 15380, 16705, 18030, 19355, 20680, 22005, 23330,
        24655, 25980],
       [  880,  2330,  3780,  5230,  6680,  8130,  9580, 11030, 12480,
        13930, 15380, 16830, 18280, 19730, 21180, 22630, 24080, 25530,
        26980, 28430],
       [  955,  2530,  4105,  5680,  7255,  8830, 10405, 11980, 13555,
        15130, 16705, 18280, 19855, 21430, 23005, 24580, 26155, 27730,
        29305, 30880],
       [ 1030,  2730,  4430,  6130,  7830,  9530, 11230, 12930, 14630,
        16330, 18030, 19730, 21430, 23130, 24830, 26530, 28230, 29930,
        31630, 33330],
       [ 1105,  2930,  4755,  6580,  8405, 10230, 12055, 13880, 15705,
        17530, 19355, 21180, 23005, 24830, 26655, 28480, 30305, 32130,
        33955, 35780],
       [ 1180,  3130,  5080,  7030,  8980, 10930, 12880, 14830, 16780,
        18730, 20680, 22630, 24580, 26530, 28480, 30430, 32380, 34330,
        36280, 38230],
       [ 1255,  3330,  5405,  7480,  9555, 11630, 13705, 15780, 17855,
        19930, 22005, 24080, 26155, 28230, 30305, 32380, 34455, 36530,
        38605, 40680],
       [ 1330,  3530,  5730,  7930, 10130, 12330, 14530, 16730, 18930,
        21130, 23330, 25530, 27730, 29930, 32130, 34330, 36530, 38730,
        40930, 43130],
       [ 1405,  3730,  6055,  8380, 10705, 13030, 15355, 17680, 20005,
        22330, 24655, 26980, 29305, 31630, 33955, 36280, 38605, 40930,
        43255, 45580],
       [ 1480,  3930,  6380,  8830, 11280, 13730, 16180, 18630, 21080,
        23530, 25980, 28430, 30880, 33330, 35780, 38230, 40680, 43130,
        45580, 48030]])

Pandas Section


In [31]:
# Create dictionary of test scores
test_dict = {'Corey':[63,75,88], 'Kevin':[48,98,92], 'Akshay': [87, 86, 85]}

In [32]:
# Create DataFrame
df = pd.DataFrame(test_dict)

In [33]:
df


Out[33]:
Corey Kevin Akshay
0 63 48 87
1 75 98 86
2 88 92 85

In [34]:
df.describe()


Out[34]:
Corey Kevin Akshay
count 3.000000 3.000000 3.0
mean 75.333333 79.333333 86.0
std 12.503333 27.300794 1.0
min 63.000000 48.000000 85.0
25% 69.000000 70.000000 85.5
50% 75.000000 92.000000 86.0
75% 81.500000 95.000000 86.5
max 88.000000 98.000000 87.0

In [35]:
# Transpose the DataFrame
df = df.T
df


Out[35]:
0 1 2
Corey 63 75 88
Kevin 48 98 92
Akshay 87 86 85

In [36]:
# Rename the columns
df.columns = ['Quiz_1', 'Quiz_2', 'Quiz_3']
df


Out[36]:
Quiz_1 Quiz_2 Quiz_3
Corey 63 75 88
Kevin 48 98 92
Akshay 87 86 85

In [37]:
# Access first row by index number
df.iloc[0]


Out[37]:
Quiz_1    63
Quiz_2    75
Quiz_3    88
Name: Corey, dtype: int64

In [38]:
df['Quiz_1']


Out[38]:
Corey     63
Kevin     48
Akshay    87
Name: Quiz_1, dtype: int64

In [39]:
# Access first column using dot notation
df.Quiz_1


Out[39]:
Corey     63
Kevin     48
Akshay    87
Name: Quiz_1, dtype: int64

In [40]:
# Limit DataFrame to first 2 rows
df[0:2]


Out[40]:
Quiz_1 Quiz_2 Quiz_3
Corey 63 75 88
Kevin 48 98 92

In [41]:
# Defining a new DataFrame from first 2 rows and last 2 columns 
rows = ['Corey', 'Kevin']
cols = ['Quiz_2', 'Quiz_3']
df_spring = df.loc[rows, cols]
df_spring


Out[41]:
Quiz_2 Quiz_3
Corey 75 88
Kevin 98 92

In [42]:
# Select first 2 rows and last 2 columns using index numbers 
df.iloc[[0,1], [1,2]]


Out[42]:
Quiz_2 Quiz_3
Corey 75 88
Kevin 98 92

In [43]:
# Define new column as mean of other columns
df['Quiz_Avg'] = df.mean(axis=1)
df


Out[43]:
Quiz_1 Quiz_2 Quiz_3 Quiz_Avg
Corey 63 75 88 75.333333
Kevin 48 98 92 79.333333
Akshay 87 86 85 86.000000

In [44]:
# Create a new column
df['Quiz_4'] = [92, 95, 88]
df


Out[44]:
Quiz_1 Quiz_2 Quiz_3 Quiz_Avg Quiz_4
Corey 63 75 88 75.333333 92
Kevin 48 98 92 79.333333 95
Akshay 87 86 85 86.000000 88

In [45]:
# Delete Column
del df['Quiz_Avg']
df


Out[45]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4
Corey 63 75 88 92
Kevin 48 98 92 95
Akshay 87 86 85 88

In [46]:
# Create new DataFrame of one row
df_new = pd.DataFrame({'Quiz_1':[np.NaN], 'Quiz_2':[np.NaN], 'Quiz_3': [np.NaN], 'Quiz_4':[71]}, index=['Adrian'])

In [47]:
df_new


Out[47]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4
Adrian NaN NaN NaN 71

In [48]:
# Concatenate DataFrames
df = pd.concat([df, df_new])
# Display new DataFrame
df


Out[48]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4
Corey 63.0 75.0 88.0 92
Kevin 48.0 98.0 92.0 95
Akshay 87.0 86.0 85.0 88
Adrian NaN NaN NaN 71

In [49]:
df['Quiz_Avg'] = df.mean(axis=1, skipna=True)
df


Out[49]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4 Quiz_Avg
Corey 63.0 75.0 88.0 92 79.50
Kevin 48.0 98.0 92.0 95 83.25
Akshay 87.0 86.0 85.0 88 86.50
Adrian NaN NaN NaN 71 71.00

In [50]:
df.Quiz_4.astype(float)


Out[50]:
Corey     92.0
Kevin     95.0
Akshay    88.0
Adrian    71.0
Name: Quiz_4, dtype: float64

In [51]:
df


Out[51]:
Quiz_1 Quiz_2 Quiz_3 Quiz_4 Quiz_Avg
Corey 63.0 75.0 88.0 92 79.50
Kevin 48.0 98.0 92.0 95 83.25
Akshay 87.0 86.0 85.0 88 86.50
Adrian NaN NaN NaN 71 71.00

In [52]:
housing_df = pd.read_csv('data/HousingData.csv')

In [53]:
housing_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       486 non-null float64
ZN         486 non-null float64
INDUS      486 non-null float64
CHAS       486 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        486 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      486 non-null float64
MEDV       506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.4 KB

In [54]:
housing_df.describe()


Out[54]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
count 486.000000 486.000000 486.000000 486.000000 506.000000 506.000000 486.000000 506.000000 506.000000 506.000000 506.000000 506.000000 486.000000 506.000000
mean 3.611874 11.211934 11.083992 0.069959 0.554695 6.284634 68.518519 3.795043 9.549407 408.237154 18.455534 356.674032 12.715432 22.532806
std 8.720192 23.388876 6.835896 0.255340 0.115878 0.702617 27.999513 2.105710 8.707259 168.537116 2.164946 91.294864 7.155871 9.197104
min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 1.730000 5.000000
25% 0.081900 0.000000 5.190000 0.000000 0.449000 5.885500 45.175000 2.100175 4.000000 279.000000 17.400000 375.377500 7.125000 17.025000
50% 0.253715 0.000000 9.690000 0.000000 0.538000 6.208500 76.800000 3.207450 5.000000 330.000000 19.050000 391.440000 11.430000 21.200000
75% 3.560262 12.500000 18.100000 0.000000 0.624000 6.623500 93.975000 5.188425 24.000000 666.000000 20.200000 396.225000 16.955000 25.000000
max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 37.970000 50.000000

In [55]:
housing_df.head()


Out[55]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
0 0.00632 18.0 2.31 0.0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0
1 0.02731 0.0 7.07 0.0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6
2 0.02729 0.0 7.07 0.0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7
3 0.03237 0.0 2.18 0.0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4
4 0.06905 0.0 2.18 0.0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 NaN 36.2

In [56]:
housing_df.shape


Out[56]:
(506, 14)

In [57]:
housing_df.isnull().any()


Out[57]:
CRIM        True
ZN          True
INDUS       True
CHAS        True
NOX        False
RM         False
AGE         True
DIS        False
RAD        False
TAX        False
PTRATIO    False
B          False
LSTAT       True
MEDV       False
dtype: bool

In [58]:
# show some records with NULL values
housing_df.loc[:5, housing_df.isnull().any()]


Out[58]:
CRIM ZN INDUS CHAS AGE LSTAT
0 0.00632 18.0 2.31 0.0 65.2 4.98
1 0.02731 0.0 7.07 0.0 78.9 9.14
2 0.02729 0.0 7.07 0.0 61.1 4.03
3 0.03237 0.0 2.18 0.0 45.8 2.94
4 0.06905 0.0 2.18 0.0 54.2 NaN
5 0.02985 0.0 2.18 0.0 58.7 5.21

In [59]:
# Data Description of columns with any Null values
housing_df.loc[:, housing_df.isnull().any()].describe()


Out[59]:
CRIM ZN INDUS CHAS AGE LSTAT
count 486.000000 486.000000 486.000000 486.000000 486.000000 486.000000
mean 3.611874 11.211934 11.083992 0.069959 68.518519 12.715432
std 8.720192 23.388876 6.835896 0.255340 27.999513 7.155871
min 0.006320 0.000000 0.460000 0.000000 2.900000 1.730000
25% 0.081900 0.000000 5.190000 0.000000 45.175000 7.125000
50% 0.253715 0.000000 9.690000 0.000000 76.800000 11.430000
75% 3.560262 12.500000 18.100000 0.000000 93.975000 16.955000
max 88.976200 100.000000 27.740000 1.000000 100.000000 37.970000
The choice depends on what you ultimately want to do with the data. If the goal is a straightforward data analysis, eliminating the rows with null values is worth consideration. However, if the goal is to use machine learning to predict data, then perhaps more is to be gained by changing the null values to a suitable replacement. It's impossible to know in advance.

In [61]:
# Replace Null values in the data
housing_df['AGE'] = housing_df['AGE'].fillna(housing_df.mean())

In [62]:
housing_df['CHAS'] = housing_df['CHAS'].fillna(0)

In [63]:
housing_df = housing_df.fillna(housing_df.median())

In [64]:
housing_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.4 KB

VISUALIZATION


In [65]:
import matplotlib.pyplot as plt
%matplotlib inline

In [66]:
# !pip uninstall seaborn -y
import sys
sys.version


Out[66]:
'3.7.6 (default, Dec 30 2019, 19:38:26) \n[Clang 11.0.0 (clang-1100.0.33.16)]'

In [67]:
# !pip3 install seaborn --upgrade
import seaborn as sns
sns.set()

In [68]:
plt.hist(housing_df['MEDV'])
plt.show()



In [69]:
plt.hist(housing_df['MEDV'])
plt.title('Median Boston Housing Prices')
plt.xlabel('1980 Median Value in Thousands')
plt.ylabel('Count')
plt.show()



In [70]:
housing_df.columns


Out[70]:
Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'MEDV'],
      dtype='object')

In [71]:
# Tweak more
title = 'Median Boston Housing Prices'
plt.figure(figsize=(10,6))
plt.hist(housing_df['MEDV'])
plt.title(title, fontsize=15)
plt.xlabel('1980 Median Value in Thousands')
plt.ylabel('Count')
plt.savefig(title, dpi=300)
plt.show()



In [72]:
def my_hist(column, title, xlab, ylab):
    title = title
    plt.figure(figsize=(10,6))
    plt.hist(column)
    plt.title(title, fontsize=15)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.savefig(title, dpi=300)
    plt.show()

In [73]:
my_hist(housing_df['RM'], 'Average Number of Rooms in Boston Households', 'Average Number of Rooms', 'Count')



In [74]:
# Optimizing solution
def my_hist(column, title, xlab, ylab, bins=10, alpha=0.7, color='c'):
    title = title
    plt.figure(figsize=(10,6))
    plt.hist(column, bins=bins, range=(3,9), alpha=alpha, color=color)
    plt.title(title, fontsize=15)
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.savefig(title, dpi=300)
    plt.show()

In [75]:
my_hist(housing_df['RM'], 'Average Number of Rooms in Boston Households', 'Average Number of Rooms', 'Count')



In [76]:
# SCATTER PLOTS
x = housing_df['RM']
y = housing_df['MEDV']
plt.scatter(x, y) 
plt.show()



In [77]:
# CORRELATION - Data points Correlation
housing_df.corr()


Out[77]:
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV
CRIM 1.000000 -0.185359 0.392632 -0.055585 0.410971 -0.220045 0.343427 -0.366025 0.601224 0.560469 0.277964 -0.365336 0.437417 -0.383895
ZN -0.185359 1.000000 -0.507304 -0.032992 -0.498619 0.312295 -0.535341 0.632428 -0.300061 -0.304385 -0.394622 0.170125 -0.398838 0.362292
INDUS 0.392632 -0.507304 1.000000 0.054693 0.738387 -0.377978 0.614248 -0.698621 0.592735 0.716267 0.385366 -0.354840 0.564508 -0.476394
CHAS -0.055585 -0.032992 0.054693 1.000000 0.070867 0.106797 0.074984 -0.092318 -0.003339 -0.035822 -0.109451 0.050608 -0.047279 0.183844
NOX 0.410971 -0.498619 0.738387 0.070867 1.000000 -0.302188 0.711864 -0.769230 0.611441 0.668023 0.188933 -0.380051 0.573040 -0.427321
RM -0.220045 0.312295 -0.377978 0.106797 -0.302188 1.000000 -0.239518 0.205246 -0.209847 -0.292048 -0.355501 0.128069 -0.604323 0.695360
AGE 0.343427 -0.535341 0.614248 0.074984 0.711864 -0.239518 1.000000 -0.724354 0.447088 0.498408 0.261826 -0.268029 0.575022 -0.377572
DIS -0.366025 0.632428 -0.698621 -0.092318 -0.769230 0.205246 -0.724354 1.000000 -0.494588 -0.534432 -0.232471 0.291512 -0.483244 0.249929
RAD 0.601224 -0.300061 0.592735 -0.003339 0.611441 -0.209847 0.447088 -0.494588 1.000000 0.910228 0.464741 -0.444413 0.467765 -0.381626
TAX 0.560469 -0.304385 0.716267 -0.035822 0.668023 -0.292048 0.498408 -0.534432 0.910228 1.000000 0.460853 -0.441808 0.524156 -0.468536
PTRATIO 0.277964 -0.394622 0.385366 -0.109451 0.188933 -0.355501 0.261826 -0.232471 0.464741 0.460853 1.000000 -0.177383 0.370727 -0.507787
B -0.365336 0.170125 -0.354840 0.050608 -0.380051 0.128069 -0.268029 0.291512 -0.444413 -0.441808 -0.177383 1.000000 -0.370993 0.333461
LSTAT 0.437417 -0.398838 0.564508 -0.047279 0.573040 -0.604323 0.575022 -0.483244 0.467765 0.524156 0.370727 -0.370993 1.000000 -0.723093
MEDV -0.383895 0.362292 -0.476394 0.183844 -0.427321 0.695360 -0.377572 0.249929 -0.381626 -0.468536 -0.507787 0.333461 -0.723093 1.000000

In [78]:
# Using Seaborn to visualize the Correlation
corr = housing_df.corr()
plt.figure(figsize=(12,10))
sns.heatmap(corr, xticklabels=corr.columns.values, 
yticklabels=corr.columns.values, cmap="Blues", linewidths=1.25, alpha=0.8)
plt.show()



In [79]:
# REGRESSION
plt.figure(figsize=(13, 8))
sns.regplot(x,y)
plt.show()


/usr/local/lib/python3.7/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

In [85]:
# More details on Regression line
import statsmodels.api as sm
X = sm.add_constant(x)
model = sm.OLS(y, X)
est = model.fit()
print(est.summary())
# The strangest part of the code is adding the constant. This is basically the y-intercept. 
# When the constant is not added, the y-intercept is 0. In our case, it makes sense that 
# the y-intercept would be 0: if there are 0 rooms, the house should have no value. In 
# general, however, it's a good idea to keep a y-intercept, and it's the default choice of 
# the preceding Seaborn graph.


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                   MEDV   R-squared:                       0.484
Model:                            OLS   Adj. R-squared:                  0.483
Method:                 Least Squares   F-statistic:                     471.8
Date:                Wed, 15 Jan 2020   Prob (F-statistic):           2.49e-74
Time:                        13:12:23   Log-Likelihood:                -1673.1
No. Observations:                 506   AIC:                             3350.
Df Residuals:                     504   BIC:                             3359.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -34.6706      2.650    -13.084      0.000     -39.877     -29.465
RM             9.1021      0.419     21.722      0.000       8.279       9.925
==============================================================================
Omnibus:                      102.585   Durbin-Watson:                   0.684
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              612.449
Skew:                           0.726   Prob(JB):                    1.02e-133
Kurtosis:                       8.190   Cond. No.                         58.4
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

There's a lot of important information in this table. The first is the value of R^2. This suggests that 48% of the data can be explained by the regression line. The second is the coefficient constant of -34.6706. This is the y-intercept. The third is the RM coefficient of 9.1021. This suggests that for every one-bedroom increase, the value of the house increased by 9,102. (Keep in mind that this is from 1980.)

The standard error suggests how far off the actual values are from the line on average, and the numbers underneath the [0.025 0.975] column give the 95% Confidence Interval of the value, meaning statsmodel is 95% confident that the true increase in the value of the average house for every one-bedroom increase is between 8,279 and 9,925.


In [ ]:
# !pip3 uninstall statsmodels #--upgrade
sys.version

In [89]:
# BOX PLOTS
plt.figure(figsize=(12, 8))
x = housing_df['RM']
y = housing_df['MEDV']
plt.boxplot(x)
plt.show()



In [90]:
# VIOLIN PLOTS
plt.figure(figsize=(10,8))
plt.violinplot(x)
plt.show()



In [91]:
# TASK:  Salary Visualization
uk_salary_df = pd.read_csv('data/UKStatistics.csv')

In [92]:
uk_salary_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 19 columns):
Post Unique Reference              51 non-null object
Name                               51 non-null object
Grade (or equivalent)              51 non-null object
Job Title                          51 non-null object
Job/Team Function                  49 non-null object
Parent Department                  51 non-null object
Organisation                       51 non-null object
Unit                               49 non-null object
Contact Phone                      46 non-null object
Contact E-mail                     49 non-null object
Reports to Senior Post             51 non-null object
Salary Cost of Reports (£)         51 non-null int64
FTE                                51 non-null float64
Actual Pay Floor (£)               51 non-null int64
Actual Pay Ceiling (£)             51 non-null int64
Unnamed: 15                        0 non-null float64
Professional/Occupational Group    51 non-null object
Notes                              27 non-null object
Valid?                             51 non-null int64
dtypes: float64(2), int64(4), object(13)
memory usage: 7.6+ KB

In [94]:
# GATHER NULL INFO IN DATASET
# housing_df.loc[:, housing_df.isnull().any()].describe()
uk_salary_df.loc[:, uk_salary_df.isnull().any()].describe()


Out[94]:
Unnamed: 15
count 0.0
mean NaN
std NaN
min NaN
25% NaN
50% NaN
75% NaN
max NaN

In [100]:
# uk_salary_df.head()
uk_salary_df['Unnamed: 15'].dropna()
# uk_salary_df.columns


Out[100]:
Series([], Name: Unnamed: 15, dtype: float64)

In [102]:
del uk_salary_df['Unnamed: 15']

In [103]:
uk_salary_df.columns


Out[103]:
Index(['Post Unique Reference', 'Name', 'Grade (or equivalent)', 'Job Title',
       'Job/Team Function', 'Parent Department', 'Organisation', 'Unit',
       'Contact Phone', 'Contact E-mail', 'Reports to Senior Post',
       'Salary Cost of Reports (£)', 'FTE', 'Actual Pay Floor (£)',
       'Actual Pay Ceiling (£)', 'Professional/Occupational Group', 'Notes',
       'Valid?'],
      dtype='object')

In [107]:
# Histogram: Actual Pay Floor (£) [def my_hist(column, title, xlab, ylab, bins=10, alpha=0.7, color='c'):]
my_hist("Actual Pay Floor (£)", "UK Actual Pay", "pay", "level", bins=7, alpha=0.8, color='b')



In [106]:
title = 'UK Salary'
plt.figure(figsize=(10,6))
plt.hist(uk_salary_df['Actual Pay Floor (£)'])
plt.title(title, fontsize=15)
plt.xlabel('Salary in Thousands')
plt.ylabel('Level')
plt.savefig(title, dpi=300)
plt.show()



In [112]:
# scatter plot using the values for x as Salary Cost of Reports (£) and y as Actual Pay Floor (£).
plt.figure(figsize=(14,8))
x = uk_salary_df['Salary Cost of Reports (£)'].map(lambda x: x * 1000)
y = uk_salary_df['Actual Pay Floor (£)']
plt.scatter(x, y) 
plt.show()



In [113]:
# BOX PLOT
plt.figure(figsize=(14,8))
plt.boxplot(x)
plt.show()



In [114]:
plt.figure(figsize=(14,8))
plt.boxplot(y)
plt.show()



In [115]:
uk_salary_df.describe()


Out[115]:
Salary Cost of Reports (£) FTE Actual Pay Floor (£) Actual Pay Ceiling (£) Valid?
count 5.100000e+01 51.000000 51.000000 51.000000 51.0
mean 1.651590e+06 0.982157 22745.098039 23823.313725 1.0
std 2.656579e+06 0.101771 45114.451404 47133.322796 0.0
min 0.000000e+00 0.300000 0.000000 0.000000 1.0
25% 0.000000e+00 1.000000 0.000000 0.000000 1.0
50% 1.105736e+06 1.000000 0.000000 0.000000 1.0
75% 2.175629e+06 1.000000 0.000000 0.000000 1.0
max 1.672047e+07 1.000000 150000.000000 154999.000000 1.0

In [116]:
uk_salary_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 18 columns):
Post Unique Reference              51 non-null object
Name                               51 non-null object
Grade (or equivalent)              51 non-null object
Job Title                          51 non-null object
Job/Team Function                  49 non-null object
Parent Department                  51 non-null object
Organisation                       51 non-null object
Unit                               49 non-null object
Contact Phone                      46 non-null object
Contact E-mail                     49 non-null object
Reports to Senior Post             51 non-null object
Salary Cost of Reports (£)         51 non-null int64
FTE                                51 non-null float64
Actual Pay Floor (£)               51 non-null int64
Actual Pay Ceiling (£)             51 non-null int64
Professional/Occupational Group    51 non-null object
Notes                              27 non-null object
Valid?                             51 non-null int64
dtypes: float64(1), int64(4), object(13)
memory usage: 7.2+ KB

In [ ]: