In [28]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Toy data set: two sales people for each of three companies.
comp_data = dict(
    Company=['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    Person=['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    Sales=[200, 120, 340, 124, 243, 350],
)
comp_df = pd.DataFrame(data=comp_data)
comp_df
Out[2]:
In [3]:
# distinct company names, in order of first appearance
comp_df.loc[:, 'Company'].unique()
Out[3]:
In [4]:
# how many distinct companies there are (missing values are not counted)
comp_df['Company'].nunique(dropna=True)
Out[4]:
In [5]:
# number of rows per company, most frequent first
comp_df['Company'].value_counts(sort=True, ascending=False)
Out[5]:
In [6]:
# apply() calls the lambda once per value of the 'Sales' column;
# the results become the new 'sq_sales' column.
comp_df['sq_sales'] = comp_df['Sales'].apply(lambda sale: sale ** 2)
comp_df
Out[6]:
We can also define a named function and pass it to the `apply()` method. When applied row-wise, such a function can read the values of one or more columns to calculate a new column.
In [9]:
def cuber(row):
    """Return the cube of a row's sales figure.

    Multiplies the row's 'Sales' value by its 'sq_sales' value. In this
    notebook 'sq_sales' holds Sales squared, so the product is Sales cubed.

    Parameters
    ----------
    row : mapping-like (e.g. the pandas Series for one DataFrame row)
        Must provide the keys 'Sales' and 'sq_sales'.

    Returns
    -------
    The product ``row['Sales'] * row['sq_sales']``.
    """
    return row['Sales'] * row['sq_sales']
# Note: the function object itself is passed (no parentheses); pandas calls it for us.
# Note: axis must be 1 ('columns') so that each call receives one row;
# the default, axis=0, would pass whole columns instead.
comp_df['cu_sales'] = comp_df.apply(func=cuber, axis='columns')
comp_df
Out[9]:
In [10]:
# rows ordered by ascending Sales; returns a sorted copy, comp_df itself is unchanged
comp_df.sort_values(by='Sales', ascending=True)
Out[10]:
Note how the index remains attached to the original rows.
In [12]:
# sort on several columns: by Company first, then by Sales within each company
sort_keys = ['Company', 'Sales']
comp_df.sort_values(by=sort_keys)
Out[12]:
In [13]:
# boolean mask with the same shape as comp_df: True wherever a value is missing
comp_df.isna()
Out[13]:
In [23]:
# First pass: load the CSV and let pandas infer every column type on its own.
registrant_df = pd.read_csv(filepath_or_buffer='./registrant.csv')
registrant_df.head(n=5)
Out[23]:
The `Registration Date` column should be of type `datetime`, and the `Current customer?` column should be of type `bool`. However, are they?
In [24]:
# Show the dtype pandas assigned to each column of the first read.
registrant_df.dtypes
Out[24]:
Everything is a generic `object`. Let us re-read the data, this time telling pandas what the data types should be.
In [25]:
def convertor_fn(x):
    """Return True if the cell value is one of 'Yes', 'yes' or 'YES'; otherwise False."""
    return x in ('Yes', 'yes', 'YES')


# column name -> function used to convert each cell of that column
convertor_map = {'Current customer?': convertor_fn}
# Second pass over the same file: parse 'Registration Date' as dates and run the
# columns listed in convertor_map through their converter functions while loading.
registrant_df2 = pd.read_csv(
    './registrant.csv',
    converters=convertor_map,
    parse_dates=['Registration Date'],
)
registrant_df2.dtypes
Out[25]:
In [26]:
# Use the registration date as the row index and drop the unwanted 'Unnamed: 0'
# column (presumably a row index that was saved into the CSV - TODO confirm).
# Non-inplace chained calls replace inplace=True, and `columns=` already names
# the axis, so the redundant axis=1 is omitted.
registrant_df2 = (
    registrant_df2
    .set_index('Registration Date')
    .drop(columns=['Unnamed: 0'])
)
registrant_df2.head()
Out[26]:
In [30]:
# Sort the rows by their index (the registration date); rebinding the name
# instead of using inplace=True avoids mutating the frame in place.
registrant_df2 = registrant_df2.sort_index()
In [31]:
# 1-based running counter, one value per row: 1..N where N is the number of rows
# (the original author notes N is 284 for this data set)
registrant_df2['registration_count'] = range(1, registrant_df2.shape[0] + 1)
In [32]:
# Line chart of the running registration count against the DataFrame index.
fig, ax = plt.subplots(figsize=(10, 7))
registrant_df2['registration_count'].plot(kind='line', ax=ax)
ax.set_title('Number of registrants over time');