In [57]:
import pandas as pd
pd.options.display.max_rows = 8
%matplotlib inline
In [58]:
x = 1
print(type(x))
In [59]:
y = 'Eric'
print(type(y))
In [60]:
z = True
print(type(z))
In [61]:
l = [1, 2, 3]
print(type(l))
In [62]:
print(3 + 1)
In [63]:
print(str(3) + str(1))
In [64]:
def print_type(var):
    """Print the value of ``var`` followed by its type.

    Parameters
    ----------
    var : object
        Any value; it is rendered with ``str()``.

    Returns
    -------
    None
        The text is written to standard output, in the form
        ``<value> is type <type>``, e.g. ``1 is type <class 'int'>``.
    """
    # The trailing space after 'type' keeps the label from running into the
    # type's repr (previously printed as "is type<class 'int'>").
    print(str(var) + ' is type ' + str(type(var)))
Where:
- `def` is the keyword for defining a function
- `print_type` is the name of the function
- `var` is the argument
In [65]:
for var in [x,y,z,l]:
print_type(var)
In [66]:
def return_max(x, y):
    """Return the larger of ``x`` and ``y``.

    ``x`` is returned only when it is strictly greater than ``y``;
    otherwise (including when the two compare equal) ``y`` is returned.
    """
    return x if x > y else y
In [67]:
z = return_max(1,2) # now z is the max of 1 and 2
print(z)
In [68]:
df = pd.read_csv('Current_Employee_Names__Salaries__and_Position_Titles.csv')
df
Out[68]:
In [69]:
print(type(df))
A "kind" of object (like int or float or DataFrame) is called a class or a type. An instance or example of a type (1, 'Eric', df) is called an object.
Objects have members, including functions and attributes.
In [70]:
df.dropna(inplace=True)
df
Out[70]:
In [71]:
df.sort_values(by='Department')
Out[71]:
In [72]:
df.sort_values('Department', ascending=False)
Out[72]:
In addition to functions, objects have (data) attributes:
In [73]:
df.shape
Out[73]:
In [74]:
df.columns
Out[74]:
DataFrames are tables. They have rows and columns.
The columns are indexed by their names. We can list the columns using the attribute columns:
In [75]:
print(df.columns)
The rows are indexed by the attribute index:
In [76]:
print(df.index)
We saw last class that we can select a column from our tables as follows:
In [77]:
departments = df['Department']
print(departments)
A single column has its own type, called Series, which is like a one-column table: a sequence of values labelled by the same row index as the DataFrame.
In [78]:
print(type(departments))
In [79]:
l = ['apple','banana', 'orange']
l[2]
Out[79]:
For lists the index is an integer, between 0 and len(list)-1.
In [80]:
d = {'a': 1, 'b': 2, 'c': 3}
d['a']
Out[80]:
We can also select multiple columns by indexing with a list of column names:
In [81]:
df[['Name', 'Department']]
Out[81]:
In [82]:
df[df.Department == 'POLICE']
Out[82]:
In [83]:
df['Department'] == 'POLICE'
Out[83]:
This is called a boolean series.
It is a Series whose values are booleans (i.e. True or False). In this case the booleans indicate whether the Department column is equal to 'POLICE'. When we pass it as an index to df, we are selecting the rows for which the series is True.
In [111]:
salaries = df['Employee Annual Salary'].copy()
salaries
Out[111]:
In [85]:
# Take the first salary by position. `salaries[0]` would look up the index
# *label* 0, which is missing if the earlier dropna() removed that row.
salary = salaries.iloc[0]
print_type(salary)
How to make this a number? Remove the first character and cast to float
In [86]:
t = float(salary[1:])
print_type(t)
Want to do this for every salary? Could just loop over and assign:
In [87]:
# Walk the index labels rather than range(len(salaries)): after the earlier
# dropna() the labels are not guaranteed to be the consecutive integers
# 0..len-1, so using positions as labels could miss rows or raise a KeyError.
for label in salaries.index:
    # Drop the first character of the string and parse the rest as a number.
    salaries.loc[label] = float(salaries.loc[label][1:])
salaries
Out[87]:
In [112]:
s = 'Eric'
In [113]:
s[1:3]
Out[113]:
In [88]:
def salary_to_float(salary):
    """Convert a salary string to a float.

    The first character of ``salary`` is discarded (presumably a leading
    currency symbol -- confirm against the data) and the remainder is parsed
    with ``float()``.

    Raises
    ------
    ValueError
        If the text after the first character is not a valid float.
    """
    amount_text = salary[1:]
    return float(amount_text)
# Start from a fresh copy so this cell can be re-run without touching df.
salaries = df['Employee Annual Salary'].copy()
# Iterate over the index labels, not range(len(salaries)): after the earlier
# dropna() the labels are not guaranteed to be the integers 0..len-1.
for label in salaries.index:
    salaries.loc[label] = salary_to_float(salaries.loc[label])
salaries
Out[88]:
In [119]:
1+1
Out[119]:
In [89]:
df['Employee Annual Salary'].apply(salary_to_float)
Out[89]:
Finally we can store the converted values in a new column of the table, next to the original column:
In [115]:
df['Employee Annual Salary Numeric'] = df['Employee Annual Salary'].apply(salary_to_float)
df
Out[115]:
In [126]:
salaries_numeric = df['Employee Annual Salary Numeric']
Dates and times are important for data analysis. Let's look again at the Chicago crime dataset.
In [129]:
df_crimes = pd.read_csv('Crimes_-_2001_to_present.csv',
parse_dates=['Date'])
df_crimes
Out[129]:
In [92]:
df_crimes['Date']
Out[92]:
In [93]:
date = df_crimes['Date'][0]
print_type(date)
In [94]:
print(date.year)
print(date.month)
print(date.day)
print(date.dayofweek)
In [95]:
def dayofweek(date):
    """Return the ``dayofweek`` attribute of ``date``.

    ``date`` can be any object exposing a ``dayofweek`` attribute; in this
    notebook it is applied to the elements of ``df_crimes['Date']``.
    """
    weekday = date.dayofweek
    return weekday
In [96]:
df_crimes['Date'].apply(dayofweek)
Out[96]:
In [97]:
df_crimes['DayOfWeek'] = df_crimes['Date'].apply(dayofweek)
In [98]:
df_crimes
Out[98]:
Given boolean series there are three basic logical operations: and (&), or (|) and not (~). We can use these on boolean series to do more complex queries of our table.
In [107]:
df['Employee Annual Salary Numeric'] > 100000
Out[107]:
In [106]:
~(df['Employee Annual Salary Numeric'] > 100000)
Out[106]:
In [130]:
df[~(df['Employee Annual Salary Numeric'] > 100000)]
Out[130]:
In [109]:
# Parentheses are essential for logical operations like below!
# `&` binds more tightly than `>` and `==`, so each comparison needs its own
# parentheses. The outer pair lets the expression span several lines; a line
# ending in a bare `&` outside any bracket is a SyntaxError.
over100k_and_police = (
    (df['Employee Annual Salary Numeric'] > 100000)
    & (df['Department'] == 'POLICE')
)
print(over100k_and_police)
In [131]:
df[over100k_and_police]
Out[131]:
In [131]:
In [131]:
In [131]:
In [131]:
In [132]:
df
Out[132]:
In [138]:
salaries = df['Employee Annual Salary'].copy()
In [139]:
salaries[1] = 10
In [140]:
df
Out[140]:
In [141]:
isalaries = []
# NOTE(review): the original line was the unterminated literal `df[']`, which
# is a SyntaxError. The column name below is assumed from the earlier cells
# that build `salaries` -- confirm this is the intended column.
salaries = df['Employee Annual Salary']
Out[141]:
In [144]:
In [ ]: