This example is adapted from Wes McKinney's 'Python for Data Analysis' (http://amzn.to/1TIMjPe).
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import sys
# render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline
# apply the 'bmh' style sheet to every figure created from here on
plt.style.use('bmh')
# make plots bigger than default always: (width, height) in inches, overridable per figure via figsize=
plt.rcParams['figure.figsize']=(10,16)
In [35]:
# list the style sheet names that can be passed to plt.style.use()
plt.style.available
Out[35]:
In [2]:
# Directory holding the per-year name files (e.g. yob1880.txt), relative to the
# notebook's working directory; the read cells below join file names onto it.
DATA_ROOT = '../data/names'
In [3]:
# load the 1880 file (comma-separated text, no header row) into a pandas DataFrame
yob_1880 = os.path.join(DATA_ROOT, 'yob1880.txt')
df_1880 = pd.read_csv(yob_1880, header=None)
In [4]:
! ls ../data/names
In [5]:
# sanity check on what the reader returned
type(df_1880)
Out[5]:
In [6]:
# per-column dtype and non-null count, plus overall memory usage
df_1880.info()
In [7]:
# first five rows; columns are numbered rather than named because the file was read with header=None
df_1880.head()
Out[7]:
In [8]:
# the file has no header row, so give the three columns meaningful names
df_1880.columns = ['name', 'sex', 'births']
In [9]:
# first 15 rows, now with the named columns
df_1880.head(15)
Out[9]:
In [12]:
# summary statistics; include='all' also summarises the non-numeric columns (name, sex)
df_1880.describe(include='all')
Out[12]:
In [13]:
# value_counts() sorts by descending count; keep only its first 10 entries
name_counts = df_1880.name.value_counts()
name_counts.head(10)
Out[13]:
In [14]:
# example of a boolean selection criterion: a True/False mask picks the rows to keep
is_clara = df_1880['name'] == 'Clara'
df_1880[is_clara]
Out[14]:
In [15]:
# drop the single-year frame; the cells below work on a DataFrame built from every yearly file
del df_1880
In [17]:
# Build the list of yearly files in this cell so it runs on a fresh kernel: nothing
# else in the notebook defines `theList`.
# NOTE(review): assumes every data file is named yobYYYY.txt, as yob1880.txt and
# the [3:7] year slice below suggest - confirm against the data directory.
theList = sorted(
    fname for fname in os.listdir(DATA_ROOT)
    if fname.startswith('yob') and fname.endswith('.txt')
)

yearly_frames = []
for fname in theList:
    # read the next year's data; the files have no header row
    year_frame = pd.read_csv(os.path.join(DATA_ROOT, fname), sep=',', header=None,
                             names=['name', 'sex', 'births'])
    # characters 3-6 of the file name are the year; store it as an integer column
    year_frame['year'] = int(fname[3:7])
    yearly_frames.append(year_frame)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0, copied
# the whole frame on every iteration, and turned the columns into object dtype
# because it started from an empty frame.
if yearly_frames:
    df = pd.concat(yearly_frames, ignore_index=True)
else:
    # no files found: keep the empty, correctly-labelled frame the loop used to start from
    df = pd.DataFrame(columns=['name', 'sex', 'births', 'year'])
In [20]:
# number of yearly files iterated over when building df
len(theList)
Out[20]:
In [18]:
# (rows, columns) of the combined frame
df.shape
Out[18]:
In [22]:
# inspect the type of a single entry in the year column
type(df.year[1])
Out[22]:
The combined DataFrame would not fit in a single Excel worksheet, which is limited to 1,048,576 rows: https://support.office.com/en-us/article/Excel-specifications-and-limits-1672b34d-7043-467e-8e27-269d656771c3.
In [19]:
# last five rows of the combined frame
df.tail()
Out[19]:
In [32]:
# helper functions
def get_name_series(theName, theSex='F', data=None):
    """Return the rows for one name/sex combination.

    Parameters
    ----------
    theName : str
        Value to match against the ``name`` column.
    theSex : str, optional
        Value to match against the ``sex`` column; defaults to ``'F'``.
    data : pandas.DataFrame, optional
        Frame to filter. When omitted, the notebook-level ``df`` is used,
        which is what the two-argument form has always done.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows (possibly empty). ``None`` if the lookup failed,
        in which case the exception type is printed.
    """
    try:
        frame = df if data is None else data
        matches = (frame['name'] == theName) & (frame['sex'] == theSex)
        return frame[matches]
    except Exception as exc:
        # Still best-effort, but a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit; those now propagate.
        print("Error: %s" % type(exc))
        return None
def plot_name_series(name_series, label, axes):
    """Draw births against year for one name on ``axes``.

    Parameters
    ----------
    name_series : object with ``year`` and ``births`` attributes
        Typically the frame returned by ``get_name_series``.
    label : str
        Legend label for the plotted line.
    axes : object with a ``plot`` method
        Target axes, e.g. a matplotlib ``Axes``.

    Returns
    -------
    None
        If plotting fails (for example because ``name_series`` is ``None``),
        the exception type is printed and nothing is drawn.
    """
    try:
        axes.plot(name_series.year, name_series.births, label=label)
    except Exception as exc:
        # Still best-effort, but a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit; those now propagate.
        print("Error: %s" % type(exc))
In [29]:
def f(x, y):
    """Divide ``x`` by ``y``, falling back to their product when ``y`` is zero.

    Small demonstration of catching one specific exception type: only
    ZeroDivisionError is handled, anything else propagates.
    """
    try:
        quotient = x / y
    except ZeroDivisionError:
        return x * y
    else:
        return quotient
In [30]:
# ordinary case: the division succeeds, so 4 / 5 is returned
f(4,5)
Out[30]:
In [31]:
# dividing by zero raises ZeroDivisionError inside f, so the fallback 4 * 0 is returned
f(4,0)
Out[31]:
In [33]:
# Popularity over time of a few names (female series, the get_name_series default),
# all drawn on one axis. Passing figsize is optional; it only overrides the
# notebook-wide default for this figure.
selected_names = ['Isabella', 'Colette', 'Zoe', 'Arabella', 'Gabriella', 'Elizabeth']
fig, ax = plt.subplots(figsize=(12, 8))
for name in selected_names:
    plot_name_series(get_name_series(name), name, ax)
# legend, labels and axis limits
ax.legend(loc='upper left')
ax.set(
    title='Popularity of select names over time',
    ylabel='Number of Births',
    xlabel='Year',
    xlim=(1900, 2015),
)
plt.xticks(rotation=45);
In [36]:
# Total births per year (rows) and sex (columns). The aggregation is named as the
# string 'sum' because passing the builtin `sum` is deprecated in recent pandas.
total_births = df.pivot_table('births', index='year', columns='sex', aggfunc='sum')
In [37]:
# last five rows of the year-by-sex totals
total_births.tail()
Out[37]:
In [38]:
# total births per year, one line per sex on a shared axis
fig, ax = plt.subplots(figsize=(8, 5))
years = total_births.index
for sex_code, legend_label in (('F', 'Females'), ('M', 'Males')):
    ax.plot(years, total_births[sex_code], label=legend_label)
ax.legend(loc='best')
ax.set(xlabel='Year', ylabel='Number of Births');
In [39]:
def get_first_letter(x):
    """Return the first character of the string ``x``."""
    return x[0]

# element-wise map over the name column; this is more than five times faster than a 'for' loop
first_letters = df.name.map(get_first_letter)
first_letters.name = 'first_letter'
# Female births by first letter (rows) and year (columns). 'sum' is passed as a
# string because the builtin `sum` is deprecated as an aggfunc in recent pandas.
table = df[df.sex == 'F'].pivot_table('births', index=first_letters, columns='year', aggfunc='sum')
In [40]:
# For each year, print the two first letters with the most female births.
# Ranking the whole column gives the right runner-up whichever letter wins; the
# earlier approach of slicing the index from 'B' only worked when 'A' was on top.
# It also avoids the removed `.ix` indexer and `Series.argmax`, which now returns
# an integer position instead of the letter.
for year in range(2000, 2014):
    ranked_letters = table[year].sort_values(ascending=False).index
    print('%s: 1. %s 2. %s' % (year, ranked_letters[0], ranked_letters[1]))
In [ ]: