notebook.community

Edit and run



In [1]:

    
# Import all libraries needed for the tutorial
import os
import sys #only needed to determine Python version number

import matplotlib.pyplot as plt
import pandas as pd
from numpy import random

# Enable inline plotting
%matplotlib inline



In [2]:

    
# The initial set of baby names that the random sample is drawn from
names = [
    'Bob',
    'Jessica',
    'Mary',
    'John',
    'Mel',
]



In [3]:

    
# Seeding the generator makes the random samples below reproducible:
# with a fixed seed the samples are identical on every run.
# The trailing "?" is IPython help syntax -- this line only displays the
# docstring of random.seed; it does not seed the generator itself.

random.seed?



In [4]:

    
# Fix the seed so the 1,000 draws below come out the same on every run
random.seed(500)

# One draw per record: pick a random position in `names`
# (the `high` bound is exclusive, so every index is valid)
random_names = [
    names[random.randint(low=0, high=len(names))]
    for draw in range(1000)
]

# Show the first 10 records
random_names[:10]









    Out[4]:





['Mary',
 'Jessica',
 'Jessica',
 'Bob',
 'Jessica',
 'Jessica',
 'Jessica',
 'Mary',
 'Mary',
 'Mary']



In [5]:

    
# Simulated birth counts for the year 1880:
# one random integer in [0, 1000) for each of the 1,000 records
births = [
    random.randint(low=0, high=1000)
    for record in range(1000)
]
births[:10]









    Out[5]:





[968, 155, 77, 578, 973, 124, 155, 403, 199, 191]



In [6]:

    
# Pair each sampled name with its birth count -> list of (name, births) tuples
BabyDataSet = [
    (baby_name, birth_count)
    for baby_name, birth_count in zip(random_names, births)
]
BabyDataSet[:10]









    Out[6]:





[('Mary', 968),
 ('Jessica', 155),
 ('Jessica', 77),
 ('Bob', 578),
 ('Jessica', 973),
 ('Jessica', 124),
 ('Jessica', 155),
 ('Mary', 403),
 ('Mary', 199),
 ('Mary', 191)]



In [7]:

    
# Build a two-column frame from the (name, births) pairs
df = pd.DataFrame(BabyDataSet, columns=['Names', 'Births'])

# Preview the first 10 rows
df.head(10)



In [8]:

    
# Export the frame as bare comma-separated rows:
# neither the index column nor a header row is written
df.to_csv('births1880.txt', header=False, index=False)



In [10]:

    
# Read the file back with read_csv's default header inference.
# The file was written without a header row, so the first record is taken
# as the column names (the df.info() output below reports 999 rows and
# columns called "Mary" and "968").
location = 'births1880.txt'
df = pd.read_csv(location, header='infer')



In [11]:

    
# Summarize the frame: index range, column names, non-null counts and dtypes
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 999 entries, 0 to 998
Data columns (total 2 columns):
Mary    999 non-null object
968     999 non-null int64
dtypes: int64(1), object(1)
memory usage: 23.4+ KB



In [12]:

    
# Show the first five rows (the default number of rows for head)
df.head(5)



In [14]:

    
# header=None tells read_csv that the file has no header row, so every line
# is read as data and the columns get integer labels (0 and 1 in the output).
df = pd.read_csv(location, header=None)
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 0 to 999
Data columns (total 2 columns):
0    1000 non-null object
1    1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 23.4+ KB



In [15]:

    
# Show the last five rows (the default number of rows for tail)
df.tail(5)



In [16]:

    
# Supply the column labels while reading; with `names` given and no header
# row in the file, the first line is kept as data
df = pd.read_csv(location, header=None, names=['Names', 'Births'])
df.head()



In [17]:

    
# Delete the temporary CSV file; its contents are already loaded into `df`.
# (`os` is imported with the other libraries in the first cell.)
os.remove(location)



In [18]:

    
# Method 1: list the distinct values found in the Names column
pd.unique(df['Names'])









    Out[18]:





array(['Mary', 'Jessica', 'Bob', 'John', 'Mel'], dtype=object)



In [20]:

    
# Print every distinct name on its own line
for unique_name in df['Names'].unique():
    print(unique_name)









    



Mary
Jessica
Bob
John
Mel



In [22]:

    
# Method 2: the column summary reports the number of unique names,
# together with the most frequent name and how often it occurs
names_summary = df['Names'].describe()
print(names_summary)









    



count     1000
unique       5
top        Bob
freq       206
Name: Names, dtype: object



In [23]:

    
# Group the records by the value in the Names column
name = df.groupby(by='Names')

# Total the births within each group. The result replaces `df`:
# it now holds one row per name, indexed by that name.
df = name.sum()
df



In [24]:

    
# Method 1: order the rows by Births, largest first, and keep the top row.
# DataFrame.sort has been removed from pandas; sort_values is its replacement.
Sorted = df.sort_values(by=['Births'], ascending=False)
Sorted.head(1)



In [25]:

    
# Method 2: take the largest value in the Births column directly
# (this yields the count only, not the name it belongs to)
df.Births.max()









    Out[25]:





106817



In [27]:

    
# Bar chart of the total births for each name
df['Births'].plot(kind='bar')

print("The most popular name")
# List the names by total births, largest first.
# DataFrame.sort(columns=...) has been removed from pandas;
# sort_values(by=...) is its replacement.
df.sort_values(by='Births', ascending=False)









    



The most popular name






    Out[27]:






  
    
      
      Births
    
    
      Names
      
    
  
  
    
      Bob
      106817
    
    
      Mel
      102319
    
    
      Mary
      99438
    
    
      Jessica
      97826
    
    
      John
      90705



In [ ]:

	0	1
995	John	151
996	Jessica	511
997	John	756
998	Jessica	294
999	John	152

	Names	Births
0	Mary	968
1	Jessica	155
2	Jessica	77
3	Bob	578
4	Jessica	973
5	Jessica	124
6	Jessica	155
7	Mary	403
8	Mary	199
9	Mary	191