This example is adapted from Wes McKinney's 'Python for Data Analysis' (http://amzn.to/1TIMjPe).


In [1]:
import numpy as np  
import pandas as pd   
import matplotlib.pyplot as plt 
import os
import sys

%matplotlib inline 
plt.style.use('bmh')
# make plots bigger than default always
plt.rcParams['figure.figsize']=(10,16)

In [35]:
plt.style.available


Out[35]:
['ggplot',
 'seaborn-whitegrid',
 'seaborn-colorblind',
 'dark_background',
 'grayscale',
 'seaborn-deep',
 'seaborn-ticks',
 'seaborn-dark',
 'seaborn-darkgrid',
 'seaborn-bright',
 'seaborn-pastel',
 'classic',
 'bmh',
 'seaborn-paper',
 'seaborn-poster',
 'seaborn-muted',
 'seaborn-talk',
 'seaborn-white',
 'seaborn-notebook',
 'seaborn-dark-palette',
 'fivethirtyeight']

In [2]:
# set path to data
DATA_ROOT = '../data/names'

In [3]:
# read a text csv file into a pandas dataframe
yob_1880 = os.path.join(DATA_ROOT, 'yob1880.txt')
df_1880 = pd.read_table(yob_1880, header=None, sep = ',')

In [4]:
! ls ../data/names


yob1880.txt yob1903.txt yob1926.txt yob1949.txt yob1972.txt yob1995.txt
yob1881.txt yob1904.txt yob1927.txt yob1950.txt yob1973.txt yob1996.txt
yob1882.txt yob1905.txt yob1928.txt yob1951.txt yob1974.txt yob1997.txt
yob1883.txt yob1906.txt yob1929.txt yob1952.txt yob1975.txt yob1998.txt
yob1884.txt yob1907.txt yob1930.txt yob1953.txt yob1976.txt yob1999.txt
yob1885.txt yob1908.txt yob1931.txt yob1954.txt yob1977.txt yob2000.txt
yob1886.txt yob1909.txt yob1932.txt yob1955.txt yob1978.txt yob2001.txt
yob1887.txt yob1910.txt yob1933.txt yob1956.txt yob1979.txt yob2002.txt
yob1888.txt yob1911.txt yob1934.txt yob1957.txt yob1980.txt yob2003.txt
yob1889.txt yob1912.txt yob1935.txt yob1958.txt yob1981.txt yob2004.txt
yob1890.txt yob1913.txt yob1936.txt yob1959.txt yob1982.txt yob2005.txt
yob1891.txt yob1914.txt yob1937.txt yob1960.txt yob1983.txt yob2006.txt
yob1892.txt yob1915.txt yob1938.txt yob1961.txt yob1984.txt yob2007.txt
yob1893.txt yob1916.txt yob1939.txt yob1962.txt yob1985.txt yob2008.txt
yob1894.txt yob1917.txt yob1940.txt yob1963.txt yob1986.txt yob2009.txt
yob1895.txt yob1918.txt yob1941.txt yob1964.txt yob1987.txt yob2010.txt
yob1896.txt yob1919.txt yob1942.txt yob1965.txt yob1988.txt yob2011.txt
yob1897.txt yob1920.txt yob1943.txt yob1966.txt yob1989.txt yob2012.txt
yob1898.txt yob1921.txt yob1944.txt yob1967.txt yob1990.txt yob2013.txt
yob1899.txt yob1922.txt yob1945.txt yob1968.txt yob1991.txt
yob1900.txt yob1923.txt yob1946.txt yob1969.txt yob1992.txt
yob1901.txt yob1924.txt yob1947.txt yob1970.txt yob1993.txt
yob1902.txt yob1925.txt yob1948.txt yob1971.txt yob1994.txt

In [5]:
type(df_1880)


Out[5]:
pandas.core.frame.DataFrame

In [6]:
df_1880.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
0    2000 non-null object
1    2000 non-null object
2    2000 non-null int64
dtypes: int64(1), object(2)
memory usage: 47.0+ KB

In [7]:
df_1880.head()


Out[7]:
0 1 2
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746

In [8]:
df_1880.columns = ['name', 'sex', 'births']

In [9]:
df_1880.head(15)


Out[9]:
name sex births
0 Mary F 7065
1 Anna F 2604
2 Emma F 2003
3 Elizabeth F 1939
4 Minnie F 1746
5 Margaret F 1578
6 Ida F 1472
7 Alice F 1414
8 Bertha F 1320
9 Sarah F 1288
10 Annie F 1258
11 Clara F 1226
12 Ella F 1156
13 Florence F 1063
14 Cora F 1045

In [12]:
df_1880.describe(include='all')


Out[12]:
name sex births
count 2000 2000 2000.000000
unique 1889 2 NaN
top Alva M NaN
freq 2 1058 NaN
mean NaN NaN 100.742000
std NaN NaN 466.108732
min NaN NaN 5.000000
25% NaN NaN 7.000000
50% NaN NaN 13.000000
75% NaN NaN 41.250000
max NaN NaN 9655.000000

In [13]:
df_1880.name.value_counts()[:10]  # slice of output of value_counts function: it returns the first 10 entries


Out[13]:
Alva       2
Louie      2
Cora       2
Ida        2
Alma       2
Daisy      2
Frances    2
Lynn       2
Jesse      2
Louis      2
Name: name, dtype: int64

In [14]:
df_1880.loc[df_1880['name']=='Clara', :]  # ex. of a boolean selection criterion


Out[14]:
name sex births
11 Clara F 1226
1633 Clara M 8

In [15]:
del df_1880

In [17]:
df = pd.DataFrame(columns=['name', 'sex', 'births', 'year'])  # empty dataframe

for i in theList:
    tmp = pd.read_csv(os.path.join(DATA_ROOT, i), sep=',', header=None)  # read the next year's data
    tmp.columns = ['name', 'sex', 'births']
    tmp['year'] = int(i[3:7])  # create a new column containing the year, force to integer
    df = df.append(tmp, ignore_index=True)  # attach the next year's data to the dataframe

In [20]:
len(theList)


Out[20]:
134

In [18]:
df.shape


Out[18]:
(1792091, 4)

In [22]:
type(df.year[1])


Out[22]:
numpy.float64

In [19]:
df.tail()


Out[19]:
name sex births year
1792086 Zyhier M 5.0 2013.0
1792087 Zylar M 5.0 2013.0
1792088 Zymari M 5.0 2013.0
1792089 Zymeer M 5.0 2013.0
1792090 Zyree M 5.0 2013.0

In [32]:
# helper functions
def get_name_series(theName, theSex='F'):  # putting in a value for theSex gives the function a default value
    try:
        return df[(df.name == theName) & (df.sex == theSex)]
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)
    
def plot_name_series(name_series, label, axes):
    try:
        axes.plot(name_series.year, name_series.births, label=label)
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)

In [29]:
def f(x, y):
    try:
        return x / y
    except ZeroDivisionError:
        return x * y

In [30]:
f(4,5)


Out[30]:
0.8

In [31]:
f(4,0)


Out[31]:
0

In [33]:
# create an empty plot of specified size - note that you do not need to specify size here
fig, ax = plt.subplots(figsize=(12, 8))

for name in ['Isabella', 'Colette', 'Zoe', 'Arabella', 'Gabriella', 'Elizabeth']:
    name_series = get_name_series(name)
    plot_name_series(name_series, name, ax)

# set some options
ax.legend(loc='upper left');
ax.set_title('Popularity of select names over time');
ax.set_ylabel('Number of Births');
ax.set_xlabel('Year');
ax.set_xlim(1900,2015);
plt.xticks(rotation=45);



In [36]:
total_births = df.pivot_table('births', index = 'year', columns = 'sex', aggfunc = sum)

In [37]:
total_births.tail()


Out[37]:
sex F M
year
2009.0 1832276.0 1978582.0
2010.0 1771846.0 1912915.0
2011.0 1752198.0 1891800.0
2012.0 1751866.0 1886972.0
2013.0 1736630.0 1871467.0

In [38]:
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(total_births.index,total_births['F'],label = 'Females')
ax.plot(total_births.index,total_births['M'], label = 'Males')

ax.legend(loc='best')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Births');



In [39]:
get_first_letter = lambda x: x[0] #a lambda function is a 'throwaway' function, used for next line
first_letters = df.name.map(get_first_letter) #this is more than five times faster than a 'for' loop
first_letters.name = 'first_letter'

table = df[df.sex == 'F'].pivot_table('births', index=first_letters, columns='year', aggfunc=sum)

In [40]:
for i in range(2000, 2014):
    print('%s: 1. %s 2. %s' %(i, table.ix[:,i].argmax(),  # argmax returns position with maximum value
           table.ix['B':,i].argmax()))                 # slicing the A's off (by index, NOT by position) to get #2


2000: 1. A 2. M
2001: 1. A 2. M
2002: 1. A 2. M
2003: 1. A 2. M
2004: 1. A 2. M
2005: 1. A 2. M
2006: 1. A 2. M
2007: 1. A 2. M
2008: 1. A 2. M
2009: 1. A 2. M
2010: 1. A 2. M
2011: 1. A 2. M
2012: 1. A 2. M
2013: 1. A 2. M

In [ ]: