Packages: pandas, numpy, matplotlib, statsmodels
pandas: an open source library providing high-performance, easy-to-use data structures and data analysis tools for the Python programming language.
Matplotlib: a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms.
In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
# This makes it so that plots show up here in the notebook.
# You do not need it if you are not using a notebook.
%matplotlib inline
from IPython.display import Image
In [3]:
data = {'Course' : 'Intro to Big Data',
'Section' : '6',
'Names' : ['Ronnie', 'Jeff', 'Teddy', 'Jerry'],
'Group' : ['1', '2', '1', '2'],
'Year' : ['Junior'] * 2 + ['Senior'] * 2,
'Date' : pd.Timestamp('20160607'),
'Quiz' : np.array([20, 90, 60, 100], dtype='float64')}
df = pd.DataFrame(data)
df
Out[3]:
In [4]:
df = df.rename(index=str, columns={"Names": "Name"})
df
Out[4]:
In [ ]:
df.to_csv('data.csv', sep = ',', index = False) # if comma separated (csv)
df.to_csv('data.tsv', sep = '\t', index = False) # if tab separated (tsv)
df.to_csv('data.txt', sep = '\t', index = False) # you can also use sep = ',' as in csv files
In [ ]:
df.to_excel('data.xlsx', index_label='label')
In [5]:
df_csv = pd.read_csv('data.csv', sep = ',')
df_csv
Out[5]:
In [6]:
df_tsv = pd.read_csv('data.tsv', sep = '\t')
df_tsv
Out[6]:
In [7]:
with pd.ExcelFile('data.xlsx') as xlsx:
df_excel = pd.read_excel(xlsx, sheetname = 'Sheet1')
df_excel
### If there are multiple sheets to read from
# with pd.ExcelFile('data.xlsx') as xlsx:
# df_sheet1 = pd.read_excel(xlsx, sheetname = 'Sheet1')
# df_sheet2 = pd.read_excel(xlsx, sheetname = 'Sheet2')
Out[7]:
In [8]:
# First, create a new dataframe
new_data = {'Course' : 'Intro to Big Data',
'Section' : '6',
'Name' : ['Donald', 'Melania'],
'Group' : '5',
'Year' : ['Freshman', 'Sophomore'],
'Date' : pd.Timestamp('20160607'),
'Quiz' : np.array([5, 85], dtype='float64')}
df2 = pd.DataFrame(new_data)
df2
Out[8]:
In [9]:
# Append the new dataframe to the existing dataframe
df = df.append(df2, ignore_index=True)
df
Out[9]:
In [10]:
df = pd.concat([df, df2], axis = 0, ignore_index = True) # If axis = 1, then add column
df
Out[10]:
In [11]:
df['Assignment'] = np.array([45, 85, 50, 90, 10, 70, 10, 70], dtype='float64')
df
Out[11]:
In [14]:
df.drop(0)
Out[14]:
In [15]:
df = df.drop('Date', axis = 1)
# Note: axis = 1 denotes that we are referring to a column, not a row
df
Out[15]:
In [16]:
# First, in order to check whether there are any duplicates
df.duplicated()
Out[16]:
In [17]:
# If there are duplicates, then run the following code
df = df.drop_duplicates()
df
Out[17]:
In [18]:
term_project = {'Group' : ['1', '2', '3', '4'],
'Presentation': [80.0, 90., 100., 50.],
'Report' : np.array([60, 80, 70, 30], dtype='float64')}
df3 = pd.DataFrame(term_project)
df3
Out[18]:
In [19]:
df
Out[19]:
In [20]:
pd.merge(df, df3, on = 'Group')
Out[20]:
Q. How should we merge the data in order to keep The Donald and Melania?
Q. Which one of these should we set how as?
In [21]:
pd.merge(df, df3, how = 'left', on = 'Group')
Out[21]:
In [22]:
pd.merge(df, df3, how = 'right', on = 'Group')
Out[22]:
In [23]:
pd.merge(df, df3, how = 'outer', on = 'Group')
Out[23]:
In [24]:
df = pd.merge(df, df3, how = 'left', on = 'Group')
df
Out[24]:
In [25]:
nRows = 3 # The number of rows to show
df.head(nRows)
Out[25]:
In [26]:
df.tail(nRows)
Out[26]:
In [27]:
df.index
Out[27]:
In [28]:
df.columns
Out[28]:
In [29]:
df.values
Out[29]:
In [30]:
df.sort_values(by='Quiz') # Ascending order
Out[30]:
In [31]:
df.sort_values(by='Quiz', ascending=False) # Descending order
Out[31]:
In [32]:
df.sort_values(by='Report', ascending=False) # Descending order
Out[32]:
In [33]:
df.where(df['Assignment'] > 50)
Out[33]:
In [34]:
df['Name'].where(df['Assignment'] > 50)
Out[34]:
In [35]:
df['Name'].where(df['Assignment'] > 50).count()
Out[35]:
In [36]:
df['Assignment']
Out[36]:
In [37]:
df[0:3]
Out[37]:
In [38]:
df.loc[0,'Name']
Out[38]:
In [39]:
df.loc[0,['Assignment','Quiz']]
Out[39]:
In [40]:
df[df['Assignment'] > 50]
Out[40]:
In [41]:
df[df['Year'].isin(['Junior'])]
Out[41]:
In [42]:
df
Out[42]:
In [96]:
# pd.isnull(df)
df.isnull()
Out[96]:
$\rightarrow$ An easier way to check is to use
In [44]:
pd.isnull(df).sum()
Out[44]:
In [45]:
df.dropna(how='any', axis = 0)
Out[45]:
In [46]:
df.dropna(how='any', axis = 1)
Out[46]:
In [49]:
df.fillna(value = 0)
Out[49]:
In [48]:
df.loc[4,'Presentation'] = 30
df.loc[5,'Presentation'] = 20
df.loc[4,'Report'] = 60
df.loc[5,'Report'] = 70
df
Out[48]:
In [50]:
df
Out[50]:
In [51]:
df.describe()
Out[51]:
In [52]:
df.describe().round(2)
Out[52]:
In [53]:
df.mean().round(2)
Out[53]:
In [54]:
df.median().round(2)
Out[54]:
In [55]:
df['Report'].min().round(2)
Out[55]:
In [56]:
df['Report'].max().round(2)
Out[56]:
In [57]:
df.var().round(2)
Out[57]:
In [58]:
df.corr().round(2)
Out[58]:
In [59]:
df.groupby('Group').mean()
Out[59]:
In [60]:
df.groupby(['Group', 'Year']).mean()
Out[60]:
In [61]:
pd.pivot_table(df, values='Report', index=['Year'], columns=['Group'])
Out[61]:
In [62]:
df['log_Report'] = np.log(df['Report'])
df
Out[62]:
In [63]:
df['sqrt_Report'] = np.sqrt(df['Report'])
df
Out[63]:
How can we make a new column 'Total' which calculate the weighted sum and rank the students by 'Total' in descending order?
In [68]:
df['Total'] = 0.15 * df['Quiz'] + 0.2 * df['Assignment'] + 0.25 * df['Presentation'] + 0.4 * df['Report']
df.sort_values(by='Total', ascending=False)['Name']
Out[68]:
In [2]:
salaries = pd.read_csv('./MLB/Salaries.csv', sep = ',')
salaries.head(10)
In [70]:
with pd.ExcelFile('Batting.xlsx') as f:
batting = pd.read_excel(f, sheetname = 'Batting')
batting.tail(5)
Out[70]:
In [71]:
data = pd.merge(salaries, batting, how='left', on = ['player', 'year', 'team'])
data.head(7)
Out[71]:
In [72]:
pitching = pd.read_stata('./MLB/Pitching.dta')
pitching.tail(4)
Out[72]:
In [73]:
pitching = pitching.dropna(how='any', axis = 1)
pitching.head()
Out[73]:
In [74]:
data = pd.merge(data, pitching, how='left', on = ['player', 'year', 'team'])
data.head(5)
Out[74]:
In [97]:
# pd.isnull(data).sum()
data.isnull().sum()
Out[97]:
In [76]:
data = data.fillna(value = 0.)
pd.isnull(data).sum()
Out[76]:
In [77]:
basic = pd.read_csv('./MLB/Basic.csv', sep = ',')
data = pd.merge(data, basic, how='inner', on = ['player'])
data.head()
Out[77]:
In [78]:
teams = pd.read_csv('./MLB/Teams.csv', sep = ',')
data = pd.merge(data, teams, how='left', on = ['team', 'year'])
data.head()
Out[78]:
In [ ]:
data.to_csv('baseball.tsv', sep = '\t', index = False)
In [79]:
data['log_salary'] = np.log(data['salary'] + 1)
data.describe().round(2)
Out[79]:
In [80]:
data.describe().round(2)
Out[80]:
In [81]:
data.groupby('team').mean().round(2).head()
Out[81]:
In [82]:
data.groupby(['team', 'year']).mean().round(2).head(20)
Out[82]:
In [83]:
year_dummies = pd.get_dummies(data['year'], drop_first = True)
year_dummies.head()
Out[83]:
In [84]:
data = pd.concat([data, year_dummies], axis = 1)
data.head()
Out[84]:
In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
# This makes it so that plots show up here in the notebook.
# You do not need it if you are not using a notebook.
%matplotlib inline
data = pd.read_csv('baseball.tsv', sep='\t')
data.columns.values
Out[4]:
In [5]:
data['salary'].hist(bins=20)
Out[5]:
In [7]:
data['log_salary'] = np.log(data['salary'] + 1)
In [9]:
def log2(value):
result = np.log(value + 1)
return result
data['log_salary'] = data['salary'].apply(log2)
In [11]:
data['log_salary'].hist(bins=20)
Out[11]:
In [13]:
data.plot.scatter(x = 'batting_RBI',
y = 'salary',
title = 'Scatter plot',
figsize = (5, 4))
Out[13]:
In [14]:
#let's get average salary by year born
last = data[(data['year'] == 2014) & (data['birthYear'] > 1900)]
tmp = last.groupby(by='birthYear').mean()
#And make a line plot of it...
tmp.plot.line(y='log_salary')
Out[14]:
In [16]:
Out[16]:
In [17]:
#ax stands for "axis", we'll use this object to change more settings
#we can also specify the image size, in inches, right here with the figsize argument
ax = tmp.plot.line(y='log_salary', figsize=(10,8))
#add a title to the chart
ax.set_title('Average salary by year born')
#label the axes
ax.set_ylabel('log(salary)')
ax.set_xlabel('Year born')
#set the ticks so they're not half years
#The range command makes a list of years starting in 1972 and counting by 2 up to (not including) 1993.
ax.set_xticks(range(1972, 1993, 2))
#show our plot
plt.show()
In [18]:
#group our data by rank
tmp = data.groupby(by='Rank').mean()
#plot the mean log salary by rank
ax = tmp['log_salary'].plot.bar(figsize=(10,8))
ax.set_title('Average pay by rank')
ax.set_ylabel('log(salary)')
ax.set_xlabel('Rank')
#set the upper and lower limits of the y axis
ax.set_ylim(ymin=0, ymax=16)
#get the location of the bars (rectangles)
rects = ax.patches
#loop throgh each bar and label it with its value
for rect, label in zip(rects, tmp['log_salary']):
#find out the height of the bar on the image
height = rect.get_height()
l = '{:5.2f}'.format(label)
ax.text(rect.get_x() + rect.get_width()/2, height + 0.5, l, ha='center', va='bottom')
plt.show()
In [19]:
#find the standard deviation instead of the mean when we group by rank
tmp2 = data.groupby(by='Rank').std()
#add the errors to out plotting call
ax = tmp['log_salary'].plot.bar(yerr=tmp2['log_salary'], figsize=(10,8))
ax.set_title('Average pay by rank')
ax.set_ylabel('log(salery)')
ax.set_xlabel('Rank')
#set the upper and lower limits of the y axis
ax.set_ylim(ymin=0, ymax=18)
plt.show()
In [20]:
#make a boxplot of salaries, by rank
ax = data.boxplot(column='log_salary', by='Rank', figsize=(10,8))
ax.set_title('Pay by rank')
ax.set_ylabel('log(salary)')
ax.set_xlabel('Rank')
plt.show()
In [25]:
with plt.xkcd():
data.log_salary.hist(bins=20)
In [26]:
with plt.xkcd():
tmp = last.groupby(by='birthYear').mean()
ax = tmp.plot.line(y='log_salary', figsize=(10,8))
ax.set_title('Average salary, by year born')
ax.set_ylabel('log(salary)')
ax.set_xlabel('Year born')
ax.set_xticks(range(1972, 1993, 2))
plt.show()
In [27]:
plt.style.available
Out[27]:
In [28]:
plt.style.use('fivethirtyeight')
ax = tmp.plot.line(y='log_salary', figsize=(10,8))
ax.set_title('Average salary, by year born')
ax.set_ylabel('log(salary)')
ax.set_xlabel('Year born')
ax.set_xticks(range(1972, 1993, 2))
plt.show()
In [29]:
plt.style.use('ggplot')
ax = tmp.plot.line(y='log_salary', figsize=(10,8))
ax.set_title('Average salary, by year born')
ax.set_ylabel('log(salary)')
ax.set_xlabel('Year born')
ax.set_xticks(range(1972, 1993, 2))
plt.show()
In [30]:
plt.style.use('classic')
ax = tmp.plot.line(y='log_salary', figsize=(10,8))
ax.set_title('Average salary, by year born')
ax.set_ylabel('log(salary)')
ax.set_xlabel('Year born')
ax.set_xticks(range(1972, 1993, 2))
plt.show()
In [31]:
plt.style.use('seaborn')
ax = tmp.plot.line(y='log_salary', figsize=(10,8))
ax.set_title('Average salary, by year born')
ax.set_ylabel('log(salary)')
ax.set_xlabel('Year born')
ax.set_xticks(range(1972, 1993, 2))
plt.show()
In [32]:
formula = 'log_salary ~ batting_RBI + ERA'
result_ols = smf.ols(formula = formula, data = data).fit()
print(result_ols.summary())
In [33]:
data['above_average'] = (data['log_salary'] > data['log_salary'].mean()).astype(float)
data['above_average'].head()
Out[33]:
In [34]:
formula = 'above_average ~ batting_RBI + ERA'
result_logit = smf.logit(formula = formula, data = data).fit()
print(result_logit.summary())
In [35]:
# Import the pacakage
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
In [36]:
fuzz.ratio("This is a test", "This is a test!")
Out[36]:
In [37]:
fuzz.partial_ratio("this is a test", "this is a test!")
Out[37]:
In [89]:
fuzz.ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
Out[89]:
In [90]:
fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
Out[90]:
In [91]:
fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
Out[91]:
In [92]:
fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
Out[92]:
In [93]:
choices = ["Atlanta Falcons", "New York Jets", "New York Giants", "Dallas Cowboys"]
process.extract("new york jets", choices, limit=2)
Out[93]:
In [94]:
process.extractOne("cowboys", choices)
Out[94]:
In [ ]: