This example is adapted from Wes McKinney's 'Python for Data Analysis' (http://amzn.to/1TIMjPe).



In [1]:

    
import numpy as np  
import pandas as pd   
import matplotlib.pyplot as plt 
import os
import sys

%matplotlib inline 
plt.style.use('bmh')
# make plots bigger than default always
plt.rcParams['figure.figsize']=(10,16)



In [35]:

    
plt.style.available









    Out[35]:





['ggplot',
 'seaborn-whitegrid',
 'seaborn-colorblind',
 'dark_background',
 'grayscale',
 'seaborn-deep',
 'seaborn-ticks',
 'seaborn-dark',
 'seaborn-darkgrid',
 'seaborn-bright',
 'seaborn-pastel',
 'classic',
 'bmh',
 'seaborn-paper',
 'seaborn-poster',
 'seaborn-muted',
 'seaborn-talk',
 'seaborn-white',
 'seaborn-notebook',
 'seaborn-dark-palette',
 'fivethirtyeight']



In [2]:

    
# set path to data
DATA_ROOT = '../data/names'



In [3]:

    
# read a text csv file into a pandas dataframe
yob_1880 = os.path.join(DATA_ROOT, 'yob1880.txt')
df_1880 = pd.read_table(yob_1880, header=None, sep = ',')



In [4]:

    
! ls ../data/names









    



yob1880.txt yob1903.txt yob1926.txt yob1949.txt yob1972.txt yob1995.txt
yob1881.txt yob1904.txt yob1927.txt yob1950.txt yob1973.txt yob1996.txt
yob1882.txt yob1905.txt yob1928.txt yob1951.txt yob1974.txt yob1997.txt
yob1883.txt yob1906.txt yob1929.txt yob1952.txt yob1975.txt yob1998.txt
yob1884.txt yob1907.txt yob1930.txt yob1953.txt yob1976.txt yob1999.txt
yob1885.txt yob1908.txt yob1931.txt yob1954.txt yob1977.txt yob2000.txt
yob1886.txt yob1909.txt yob1932.txt yob1955.txt yob1978.txt yob2001.txt
yob1887.txt yob1910.txt yob1933.txt yob1956.txt yob1979.txt yob2002.txt
yob1888.txt yob1911.txt yob1934.txt yob1957.txt yob1980.txt yob2003.txt
yob1889.txt yob1912.txt yob1935.txt yob1958.txt yob1981.txt yob2004.txt
yob1890.txt yob1913.txt yob1936.txt yob1959.txt yob1982.txt yob2005.txt
yob1891.txt yob1914.txt yob1937.txt yob1960.txt yob1983.txt yob2006.txt
yob1892.txt yob1915.txt yob1938.txt yob1961.txt yob1984.txt yob2007.txt
yob1893.txt yob1916.txt yob1939.txt yob1962.txt yob1985.txt yob2008.txt
yob1894.txt yob1917.txt yob1940.txt yob1963.txt yob1986.txt yob2009.txt
yob1895.txt yob1918.txt yob1941.txt yob1964.txt yob1987.txt yob2010.txt
yob1896.txt yob1919.txt yob1942.txt yob1965.txt yob1988.txt yob2011.txt
yob1897.txt yob1920.txt yob1943.txt yob1966.txt yob1989.txt yob2012.txt
yob1898.txt yob1921.txt yob1944.txt yob1967.txt yob1990.txt yob2013.txt
yob1899.txt yob1922.txt yob1945.txt yob1968.txt yob1991.txt
yob1900.txt yob1923.txt yob1946.txt yob1969.txt yob1992.txt
yob1901.txt yob1924.txt yob1947.txt yob1970.txt yob1993.txt
yob1902.txt yob1925.txt yob1948.txt yob1971.txt yob1994.txt



In [5]:

    
type(df_1880)









    Out[5]:





pandas.core.frame.DataFrame



In [6]:

    
df_1880.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 3 columns):
0    2000 non-null object
1    2000 non-null object
2    2000 non-null int64
dtypes: int64(1), object(2)
memory usage: 47.0+ KB



In [7]:

    
df_1880.head()



In [8]:

    
df_1880.columns = ['name', 'sex', 'births']



In [9]:

    
df_1880.head(15)



In [12]:

    
df_1880.describe(include='all')









    Out[12]:






  
    
      
      name
      sex
      births
    
  
  
    
      count
      2000
      2000
      2000.000000
    
    
      unique
      1889
      2
      NaN
    
    
      top
      Alva
      M
      NaN
    
    
      freq
      2
      1058
      NaN
    
    
      mean
      NaN
      NaN
      100.742000
    
    
      std
      NaN
      NaN
      466.108732
    
    
      min
      NaN
      NaN
      5.000000
    
    
      25%
      NaN
      NaN
      7.000000
    
    
      50%
      NaN
      NaN
      13.000000
    
    
      75%
      NaN
      NaN
      41.250000
    
    
      max
      NaN
      NaN
      9655.000000



In [13]:

    
df_1880.name.value_counts()[:10]  # slice of output of value_counts function: it returns the first 10 entries









    Out[13]:





Alva       2
Louie      2
Cora       2
Ida        2
Alma       2
Daisy      2
Frances    2
Lynn       2
Jesse      2
Louis      2
Name: name, dtype: int64



In [14]:

    
df_1880.loc[df_1880['name']=='Clara', :]  # ex. of a boolean selection criterion



In [15]:

    
del df_1880



In [17]:

    
df = pd.DataFrame(columns=['name', 'sex', 'births', 'year'])  # empty dataframe

for i in theList:
    tmp = pd.read_csv(os.path.join(DATA_ROOT, i), sep=',', header=None)  # read the next year's data
    tmp.columns = ['name', 'sex', 'births']
    tmp['year'] = int(i[3:7])  # create a new column containing the year, force to integer
    df = df.append(tmp, ignore_index=True)  # attach the next year's data to the dataframe



In [20]:

    
len(theList)









    Out[20]:





134



In [18]:

    
df.shape









    Out[18]:





(1792091, 4)



In [22]:

    
type(df.year[1])









    Out[22]:





numpy.float64

This wouldn't fit in Excel, where max rows is 1,048,576: https://support.office.com/en-us/article/Excel-specifications-and-limits-1672b34d-7043-467e-8e27-269d656771c3.



In [19]:

    
df.tail()



In [32]:

    
# helper functions
def get_name_series(theName, theSex='F'):  # putting in a value for theSex gives the function a default value
    try:
        return df[(df.name == theName) & (df.sex == theSex)]
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)
    
def plot_name_series(name_series, label, axes):
    try:
        axes.plot(name_series.year, name_series.births, label=label)
    except:
        e = sys.exc_info()[0]
        print("Error: %s" % e)



In [29]:

    
def f(x, y):
    try:
        return x / y
    except ZeroDivisionError:
        return x * y



In [30]:

    
f(4,5)









    Out[30]:





0.8



In [31]:

    
f(4,0)









    Out[31]:





0



In [33]:

    
# create an empty plot of specified size - note that you do not need to specify size here
fig, ax = plt.subplots(figsize=(12, 8))

for name in ['Isabella', 'Colette', 'Zoe', 'Arabella', 'Gabriella', 'Elizabeth']:
    name_series = get_name_series(name)
    plot_name_series(name_series, name, ax)

# set some options
ax.legend(loc='upper left');
ax.set_title('Popularity of select names over time');
ax.set_ylabel('Number of Births');
ax.set_xlabel('Year');
ax.set_xlim(1900,2015);
plt.xticks(rotation=45);



In [36]:

    
total_births = df.pivot_table('births', index = 'year', columns = 'sex', aggfunc = sum)



In [37]:

    
total_births.tail()



In [38]:

    
fig, ax = plt.subplots(figsize=(8,5))
ax.plot(total_births.index,total_births['F'],label = 'Females')
ax.plot(total_births.index,total_births['M'], label = 'Males')

ax.legend(loc='best')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Births');



In [39]:

    
get_first_letter = lambda x: x[0] #a lambda function is a 'throwaway' function, used for next line
first_letters = df.name.map(get_first_letter) #this is more than five times faster than a 'for' loop
first_letters.name = 'first_letter'

table = df[df.sex == 'F'].pivot_table('births', index=first_letters, columns='year', aggfunc=sum)



In [40]:

    
for i in range(2000, 2014):
    print('%s: 1. %s 2. %s' %(i, table.ix[:,i].argmax(),  # argmax returns position with maximum value
           table.ix['B':,i].argmax()))                 # slicing the A's off (by index, NOT by position) to get #2









    



2000: 1. A 2. M
2001: 1. A 2. M
2002: 1. A 2. M
2003: 1. A 2. M
2004: 1. A 2. M
2005: 1. A 2. M
2006: 1. A 2. M
2007: 1. A 2. M
2008: 1. A 2. M
2009: 1. A 2. M
2010: 1. A 2. M
2011: 1. A 2. M
2012: 1. A 2. M
2013: 1. A 2. M



In [ ]:

	0	1	2
0	Mary	F	7065
1	Anna	F	2604
2	Emma	F	2003
3	Elizabeth	F	1939
4	Minnie	F	1746

	name	sex	births
0	Mary	F	7065
1	Anna	F	2604
2	Emma	F	2003
3	Elizabeth	F	1939
4	Minnie	F	1746
5	Margaret	F	1578
6	Ida	F	1472
7	Alice	F	1414
8	Bertha	F	1320
9	Sarah	F	1288
10	Annie	F	1258
11	Clara	F	1226
12	Ella	F	1156
13	Florence	F	1063
14	Cora	F	1045

	name	sex	births	year
1792086	Zyhier	M	5.0	2013.0
1792087	Zylar	M	5.0	2013.0
1792088	Zymari	M	5.0	2013.0
1792089	Zymeer	M	5.0	2013.0
1792090	Zyree	M	5.0	2013.0

sex	F	M
year
2009.0	1832276.0	1978582.0
2010.0	1771846.0	1912915.0
2011.0	1752198.0	1891800.0
2012.0	1751866.0	1886972.0
2013.0	1736630.0	1871467.0

	name	sex	births
count	2000	2000	2000.000000
unique	1889	2	NaN
top	Alva	M	NaN
freq	2	1058	NaN
mean	NaN	NaN	100.742000
std	NaN	NaN	466.108732
min	NaN	NaN	5.000000
25%	NaN	NaN	7.000000
50%	NaN	NaN	13.000000
75%	NaN	NaN	41.250000
max	NaN	NaN	9655.000000