In [1]:
# Import Libraries
import os
import re
import urllib2
from zipfile import ZipFile
import csv
import cPickle as pickle
In [2]:
def downloadNames():
    """Download the SSA baby-names archive and save it as ``names.zip``.

    Fetches https://www.ssa.gov/oact/babynames/names.zip and writes the
    response body to ``names.zip`` in the current working directory,
    replacing any existing file.  Returns nothing.
    """
    u = urllib2.urlopen('https://www.ssa.gov/oact/babynames/names.zip')
    try:
        # A zip archive is binary data: 'wb' keeps platforms that translate
        # line endings in text mode from corrupting it.
        with open('names.zip', 'wb') as localFile:
            localFile.write(u.read())
    finally:
        # Release the HTTP connection even when reading or writing fails.
        u.close()
In [3]:
def getNameList():
    """Return ``(maleNames, femaleNames)`` built from the SSA baby-name data.

    Each list holds ``(name, male_count, female_count)`` tuples.  A name is
    placed in the male list when its male count is strictly greater than its
    female count, in the female list in the opposite case, and is dropped
    when both counts are equal.

    The result is cached in ``names.pickle`` in the current working
    directory.  When that file is missing, the lists are rebuilt from
    ``names.zip`` (downloaded first with ``downloadNames()`` if it is absent)
    and the cache is written before returning.
    """
    if not os.path.exists('names.pickle'):
        print('names.pickle does not exist, generating')
        # https://www.ssa.gov/oact/babynames/names.zip
        if not os.path.exists('names.zip'):
            print('names.zip does not exist, downloading from ssa.gov')
            downloadNames()
        else:
            print('names.zip exists, not downloading')
        print('Extracting names from names.zip')
        namesDict = extractNamesDict()
        maleNames = []
        femaleNames = []
        print('Sorting Names')
        for name in namesDict:
            counts = namesDict[name]
            entry = (name, counts[0], counts[1])
            if counts[0] > counts[1]:
                maleNames.append(entry)
            elif counts[1] > counts[0]:
                femaleNames.append(entry)
            # Equal counts: the name is ambiguous and intentionally skipped.
        names = (maleNames, femaleNames)
        print('Saving names.pickle')
        # -1 selects the highest pickle protocol available.
        with open('names.pickle', 'wb') as fw:
            pickle.dump(names, fw, -1)
        print('Saved names.pickle')
    else:
        print('names.pickle exists, loading data')
        # NOTE: unpickling can execute code stored in the file; only load a
        # names.pickle that was produced locally by this function.
        with open('names.pickle', 'rb') as f:
            names = pickle.load(f)
        print('names.pickle loaded')
    print('%d male names loaded, %d female names loaded'
          % (len(names[0]), len(names[1])))
    return names[0], names[1]
In [4]:
def extractNamesDict():
    """Aggregate per-name gender counts from every member of ``names.zip``.

    Each archive member is parsed as comma-separated ``name,gender,count``
    rows.  Names are upper-cased, and counts are summed across all members.

    Returns:
        dict mapping ``NAME`` to ``[male_total, female_total]``.

    Rows that are too short, carry a gender other than ``'M'``/``'F'``, or
    have a non-integer count are skipped.  A member the csv parser rejects
    stops contributing at the point of failure; rows already read from it
    are kept.  Errors raised while reading the archive itself propagate to
    the caller.
    """
    names = {}
    genderMap = {'M': 0, 'F': 1}
    with ZipFile('names.zip', 'r') as zf:
        for filename in zf.namelist():
            # splitlines() on the raw bytes gives universal-newline handling
            # ('\n', '\r', '\r\n') without keeping a member handle open.
            lines = zf.read(filename).splitlines()
            if lines and not isinstance(lines[0], str):
                # Bytes lines (Python 3): csv needs text; latin-1 never fails.
                lines = [line.decode('latin-1') for line in lines]
            try:
                for row in csv.reader(lines):
                    try:
                        name = row[0].upper()
                        gender = genderMap[row[1]]
                        count = int(row[2])
                    except (IndexError, KeyError, ValueError):
                        # Malformed row: ignore it, keep going.
                        continue
                    names.setdefault(name, [0, 0])[gender] += count
            except csv.Error:
                # Non-CSV member: keep whatever was parsed so far.
                pass
            print('\tImported %s' % filename)
    return names
In [5]:
# Load the name lists (built and cached on first run); every entry is a
# (name, male_count, female_count) tuple.
male_names, female_names = getNameList()
In [9]:
# Tuples compare element by element, so this orders entries by name first.
male_names = sorted(male_names)
In [15]:
# Preview the first ten (name, male_count, female_count) entries.
male_names[:10]
Out[15]:
In [11]:
# Tuples compare element by element, so this orders entries by name first.
female_names = sorted(female_names)
In [13]:
# Preview the first ten (name, male_count, female_count) entries.
female_names[:10]
Out[13]:
In [25]:
# Keep only names that are used for boys at least this many times as often
# as for girls.
MIN_MALE_TO_FEMALE_RATIO = 4

# Comparing male >= ratio * female gives the same result as testing
# male / female >= ratio, and needs no special case for a female count of 0
# (such names are always kept).
new_male_list = [
    name
    for name, male_count, female_count in male_names
    if male_count >= MIN_MALE_TO_FEMALE_RATIO * female_count
]
print("Total number of Male Names after is %d." % len(new_male_list))
In [17]:
# Preview the first ten names that passed the male filter.
new_male_list[:10]
Out[17]:
In [26]:
# Keep only names that are used for girls at least this many times as often
# as for boys.
MIN_FEMALE_TO_MALE_RATIO = 4

# Comparing female >= ratio * male gives the same result as testing
# female / male >= ratio, and needs no special case for a male count of 0
# (such names are always kept).
new_female_list = [
    name
    for name, male_count, female_count in female_names
    if female_count >= MIN_FEMALE_TO_MALE_RATIO * male_count
]
print("Total number of Female Names after is %d." % len(new_female_list))
In [19]:
# Preview the first ten names that passed the female filter.
new_female_list[:10]
Out[19]:
In [20]:
def find_gender_from_first_name(name):
    """Print the gender label for a first name.

    The name is upper-cased and looked up in the notebook-level lists
    ``new_male_list`` and then ``new_female_list``.  Prints ``Male``,
    ``Female`` or ``Unknown``; nothing is returned.
    """
    key = name.upper()
    if key in new_male_list:
        label = "Male"
    elif key in new_female_list:
        label = "Female"
    else:
        label = "Unknown"
    print(label)
In [21]:
# Example lookup; the match is case-insensitive because the name is upper-cased.
find_gender_from_first_name('Harsh')
In [22]:
# Example lookup.  NOTE(review): 'Marry' may be a typo for 'Mary' - confirm.
find_gender_from_first_name('Marry')
In [27]:
# Example lookup; names found in neither filtered list print "Unknown".
find_gender_from_first_name('Satya')