In [1]:
# Import all libraries needed for the tutorial
import pandas as pd
from numpy import random
import matplotlib.pyplot as plt
import sys #only needed to determine Python version number

# Enable inline plotting
%matplotlib inline

In [2]:
# The initial pool of baby names that the random sample is drawn from
names = [
    'Bob',
    'Jessica',
    'Mary',
    'John',
    'Mel',
]

In [3]:
# Seeding the generator makes the random samples below reproducible,
# i.e. the same "random" values are produced on every run.
# The trailing `?` is IPython help syntax: this line only displays the
# documentation for random.seed. The seed itself is set in the next cell.

random.seed?

In [4]:
# Fix the seed so the 1,000 draws below are identical on every run
random.seed(500)

# Draw 1,000 random positions in `names`, one at a time, and look up each name
random_names = [
    names[position]
    for position in (random.randint(low=0, high=len(names)) for _ in range(1000))
]

# Show the first 10 records
random_names[:10]


Out[4]:
['Mary',
 'Jessica',
 'Jessica',
 'Bob',
 'Jessica',
 'Jessica',
 'Jessica',
 'Mary',
 'Mary',
 'Mary']

In [5]:
# Simulated birth counts for the year 1880: 1,000 random integers drawn
# from 0 up to (but not including) 1000, one per sampled name
births = list(random.randint(low=0, high=1000) for _ in range(1000))

# Show the first 10 counts
births[:10]


Out[5]:
[968, 155, 77, 578, 973, 124, 155, 403, 199, 191]

In [6]:
# Pair every sampled name with the birth count at the same position
BabyDataSet = [
    (baby_name, birth_count)
    for baby_name, birth_count in zip(random_names, births)
]

# Show the first 10 (name, births) pairs
BabyDataSet[:10]


Out[6]:
[('Mary', 968),
 ('Jessica', 155),
 ('Jessica', 77),
 ('Bob', 578),
 ('Jessica', 973),
 ('Jessica', 124),
 ('Jessica', 155),
 ('Mary', 403),
 ('Mary', 199),
 ('Mary', 191)]

In [7]:
# Load the (name, births) pairs into a DataFrame with labelled columns
df = pd.DataFrame(BabyDataSet, columns=['Names', 'Births'])

# Show the first 10 rows
df.head(10)


Out[7]:
Names Births
0 Mary 968
1 Jessica 155
2 Jessica 77
3 Bob 578
4 Jessica 973
5 Jessica 124
6 Jessica 155
7 Mary 403
8 Mary 199
9 Mary 191

In [8]:
# Export the frame as comma-separated text, writing neither the index
# nor a header row -- the file contains data records only
df.to_csv('births1880.txt', header=False, index=False)

In [10]:
# Path of the text file written above
location = 'births1880.txt'

# Read it back with read_csv's default header handling. No `header` argument
# is given, so the first record of the file ends up as the column labels
# (visible in the info()/head() output that follows).
df = pd.read_csv(filepath_or_buffer=location)

In [11]:
# Summarise the frame that was just read. Only 999 entries are reported and
# the column labels are 'Mary' and '968': the first record was used as the header.
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 0 to 998
Data columns (total 2 columns):
Mary    999 non-null object
968     999 non-null int64
dtypes: int64(1), object(1)
memory usage: 23.4+ KB

In [12]:
# Preview the first rows; note the column labels are really the first data record
df.head()


Out[12]:
Mary 968
0 Jessica 155
1 Jessica 77
2 Bob 578
3 Jessica 973
4 Jessica 124

In [14]:
# Re-read the file, this time telling read_csv there is no header row
# so that every record is kept as data
df = pd.read_csv(location, header=None)
# Summarise the result: all 1000 entries are present, columns are labelled 0 and 1
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 2 columns):
0    1000 non-null object
1    1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 23.4+ KB

In [15]:
# Preview the last rows; the columns carry the default labels 0 and 1
df.tail()


Out[15]:
0 1
995 John 151
996 Jessica 511
997 John 756
998 Jessica 294
999 John 152

In [16]:
# Re-read the file once more, supplying the column labels explicitly
df = pd.read_csv(location, names=['Names','Births'])
# Show the first 5 rows to confirm the labels and that the first record is kept
df.head(5)


Out[16]:
Names Births
0 Mary 968
1 Jessica 155
2 Jessica 77
3 Bob 578
4 Jessica 973

In [17]:
# Clean up: delete the text file from disk; the data has already been read into df
import os
os.remove(location)

In [18]:
# Method 1: list the distinct values found in the Names column
df['Names'].unique()


Out[18]:
array(['Mary', 'Jessica', 'Bob', 'John', 'Mel'], dtype=object)

In [20]:
# Print each distinct name on its own line instead of showing the array repr
for unique_name in df['Names'].unique():
    print(unique_name)


Mary
Jessica
Bob
John
Mel

In [22]:
# Method 2: describe() summarises the Names column -- total count, number of
# unique values, the most frequent value and how often it occurs
print(df['Names'].describe())


count     1000
unique       5
top        Bob
freq       206
Name: Names, dtype: object

In [23]:
# Group the rows by the value in the Names column
name = df.groupby(by='Names')

# Add up the births within each group.
# Note: this rebinds df to the aggregated frame (one row per name, indexed by Names).
df = name.sum()

# Display the totals
df


Out[23]:
Births
Names
Bob 106817
Jessica 97826
John 90705
Mary 99438
Mel 102319

In [24]:
# Method 1: order the names by total births, largest first, and keep the top row.
# DataFrame.sort() has been removed from pandas; sort_values(by=...) is the
# replacement and returns a new, sorted frame without modifying df.
Sorted = df.sort_values(by=['Births'], ascending=False)
Sorted.head(1)


Out[24]:
Births
Names
Bob 106817

In [25]:
# Method 2: take the largest value in the Births column directly
# (this gives the number only, not the name it belongs to)
df['Births'].max()


Out[25]:
106817

In [27]:
# Create graph: bar chart of total births, one bar per name
df['Births'].plot(kind='bar')

print("The most popular name")
# Show all names ordered by total births, largest first.
# DataFrame.sort(columns=...) has been removed from pandas;
# sort_values(by=...) is the replacement.
df.sort_values(by='Births', ascending=False)


The most popular name
Out[27]:
Births
Names
Bob 106817
Mel 102319
Mary 99438
Jessica 97826
John 90705

In [ ]: