In [1]:
# Import Libraries

import os
import re
import urllib2
from zipfile import ZipFile
import csv
import cPickle as pickle

In [2]:
def downloadNames():
    """Download the SSA baby-names archive and save it as ``names.zip``.

    The archive is fetched from ssa.gov and written to the current working
    directory, replacing any existing ``names.zip``.
    """
    u = urllib2.urlopen('https://www.ssa.gov/oact/babynames/names.zip')
    try:
        # Read the whole body before touching the disk: if the transfer fails
        # no empty names.zip is left behind to be mistaken for a finished
        # download on the next run.
        payload = u.read()
    finally:
        # Release the HTTP connection whether or not the read succeeded.
        u.close()

    # A zip archive is binary data; 'wb' stops Windows from translating
    # newline bytes and corrupting the file.
    with open('names.zip', 'wb') as localFile:
        localFile.write(payload)

In [3]:
def getNameList():
    """Return ``(maleNames, femaleNames)`` built from the baby-names data.

    Each list holds ``(name, maleCount, femaleCount)`` tuples. A name is put
    in the male list when its male count is strictly greater than its female
    count, in the female list in the opposite case, and in neither list when
    the two counts are equal.

    The result is cached in ``names.pickle`` in the current directory. When
    that file is missing it is rebuilt from ``names.zip`` (downloaded first if
    it is missing too) and saved; otherwise the cached lists are loaded.
    """
    if not os.path.exists('names.pickle'):
        print('names.pickle does not exist, generating')

        # https://www.ssa.gov/oact/babynames/names.zip
        if not os.path.exists('names.zip'):
            print('names.zip does not exist, downloading from ssa.gov')
            downloadNames()
        else:
            print('names.zip exists, not downloading')

        print('Extracting names from names.zip')
        namesDict = extractNamesDict()

        maleNames = list()
        femaleNames = list()

        print('Sorting Names')
        for name in namesDict:
            maleCount, femaleCount = namesDict[name]
            entry = (name, maleCount, femaleCount)
            if maleCount > femaleCount:
                maleNames.append(entry)
            elif femaleCount > maleCount:
                femaleNames.append(entry)
            # Equal counts: the name is deliberately left out of both lists.

        names = (maleNames, femaleNames)

        print('Saving names.pickle')
        # -1 selects the highest pickle protocol available.
        with open('names.pickle', 'wb') as fw:
            pickle.dump(names, fw, -1)
        print('Saved names.pickle')
    else:
        print('names.pickle exists, loading data')
        # NOTE: unpickling can execute arbitrary code. Only load a
        # names.pickle that this notebook produced itself.
        with open('names.pickle', 'rb') as f:
            names = pickle.load(f)
        print('names.pickle loaded')

    print('%d male names loaded, %d female names loaded' % (len(names[0]), len(names[1])))

    return names[0], names[1]

In [4]:
def extractNamesDict():
    """Tally male and female counts per name across every file in ``names.zip``.

    Each archive member is read as comma-separated rows whose first three
    fields are name, gender (``M`` or ``F``) and count. Names are upper-cased
    and counts are summed over all members.

    Returns:
        dict mapping upper-cased name -> ``[maleCount, femaleCount]``.

    Rows that do not fit that shape (too few fields, an unknown gender, a
    non-integer count) are skipped. A member the csv module cannot parse is
    abandoned at the failing line; rows read before that point are kept.
    Errors raised by the archive itself are not suppressed.
    """
    names = dict()
    genderMap = {'M': 0, 'F': 1}

    with ZipFile('names.zip', 'r') as zf:
        for filename in zf.namelist():
            with zf.open(filename, 'r') as fp:
                # Archive members yield byte strings, while csv.reader wants
                # the interpreter's native str. Where the two are the same
                # type the line passes through untouched; otherwise latin-1
                # decodes any byte without raising.
                lines = (raw if isinstance(raw, str) else raw.decode('latin-1')
                         for raw in fp)
                rows = csv.reader(lines, delimiter=',')
                try:
                    for row in rows:
                        try:
                            name = row[0].upper()
                            gender = genderMap[row[1]]
                            count = int(row[2])
                        except (IndexError, KeyError, ValueError):
                            # Not a name,gender,count row: ignore it.
                            continue
                        names.setdefault(name, [0, 0])[gender] += count
                except csv.Error:
                    # Unparseable content (e.g. a non-text member): stop
                    # reading this member and move on to the next one.
                    pass

            print('\tImported %s' % filename)
    return names

In [5]:
# Load the cached name lists, or build them from names.zip when no cache exists.
male_names, female_names = getNameList()


names.pickle exists, loading data
names.pickle loaded
34722 male names loaded, 60185 female names loaded

In [9]:
# Order the (name, male_count, female_count) tuples; the name is the first
# field, so the result is alphabetical by name.
male_names = list(male_names)
male_names.sort()

In [15]:
# Preview the first ten entries; each is (name, male_count, female_count).
male_names[:10]


Out[15]:
[('AABAN', 87, 0),
 ('AABID', 5, 0),
 ('AADAM', 218, 0),
 ('AADAN', 122, 0),
 ('AADARSH', 173, 0),
 ('AADEN', 4213, 5),
 ('AADESH', 15, 0),
 ('AADHAV', 133, 0),
 ('AADHAVAN', 24, 0),
 ('AADHI', 23, 0)]

In [11]:
# Order the (name, male_count, female_count) tuples; the name is the first
# field, so the result is alphabetical by name.
female_names = list(female_names)
female_names.sort()

In [13]:
# Preview the first ten entries; each is (name, male_count, female_count).
female_names[:10]


Out[13]:
[('AABHA', 0, 28),
 ('AABRIELLA', 0, 15),
 ('AADA', 0, 5),
 ('AADHIRA', 0, 37),
 ('AADHYA', 0, 904),
 ('AADISON', 0, 11),
 ('AADITRI', 0, 26),
 ('AADRIKA', 0, 19),
 ('AADVIKA', 0, 20),
 ('AADYA', 0, 717)]

In [25]:
# Keep a name only when its male count is at least four times its female
# count. Comparing against 4 * female_count gives the same answer as dividing
# the two integer counts, and a female count of zero needs no special
# handling: such names always pass.
new_male_list = [name for name, male_count, female_count in male_names
                 if male_count >= 4 * female_count]

print("Total number of Male Names after is %d." % len(new_male_list))


Total number of Male Names after is 33120.

In [17]:
# Preview the first ten names that passed the ratio filter (names only).
new_male_list[:10]


Out[17]:
['AABAN',
 'AABID',
 'AADAM',
 'AADAN',
 'AADARSH',
 'AADEN',
 'AADESH',
 'AADHAV',
 'AADHAVAN',
 'AADHI']

In [26]:
# Keep a name only when its female count is at least four times its male
# count. Comparing against 4 * male_count gives the same answer as dividing
# the two integer counts, and a male count of zero needs no special
# handling: such names always pass.
new_female_list = [name for name, male_count, female_count in female_names
                   if female_count >= 4 * male_count]

print("Total number of Female Names after is %d." % len(new_female_list))


Total number of Female Names after is 58801.

In [19]:
# Preview the first ten names that passed the ratio filter (names only).
new_female_list[:10]


Out[19]:
['AABHA',
 'AABRIELLA',
 'AADA',
 'AADHIRA',
 'AADHYA',
 'AADISON',
 'AADITRI',
 'AADRIKA',
 'AADVIKA',
 'AADYA']

Find Gender from First Name


In [20]:
def find_gender_from_first_name(name):
    """Print the gender associated with a first name.

    The name is upper-cased and looked up in ``new_male_list`` first, then in
    ``new_female_list``. Prints "Male" or "Female" on a match and "Unknown"
    when the name is in neither list. Nothing is returned.
    """
    key = name.upper()
    if key in new_male_list:
        label = "Male"
    elif key in new_female_list:
        label = "Female"
    else:
        label = "Unknown"
    print(label)

Let's do some Gender Testing


In [21]:
# Input case does not matter: the name is upper-cased before the lookup.
find_gender_from_first_name('Harsh')


Male

In [22]:
# The spelling is looked up exactly as given; no spell correction is applied.
find_gender_from_first_name('Marry')


Female

In [27]:
# A name found in neither filtered list is reported as "Unknown".
find_gender_from_first_name('Satya')


Unknown