In [1]:
import mechanize
from random import randrange
from bs4 import BeautifulSoup
import time
alphabet = 'abcdefghijklmnopqrstuvwxyz'
bigrams = []
for i in range(12,26):
   for j in range(26):
           bigrams.append(alphabet[i]+alphabet[j])
            
names = []

In [20]:
def extract(html, bigram):
      global names
      soup = BeautifulSoup(html)
      trlist = []
      for tr in soup.find_all('tr'):
          if str(tr)[:18] == '<tr><td class=\"tan':
              trlist.append(tr)
      for item in trlist:
          if str(item)[21:24].lower() == bigram:
              names.append(item)

In [3]:
br = mechanize.Browser()
#br.set_all_readonly(False)    # allow everything to be written to
br.set_handle_robots(False)   # ignore robots
br.set_handle_refresh(False)  # can sometimes hang without this
br.addheaders = [('User-agent', 'Chrome')]

In [4]:
#

problems = []

for bigram in bigrams:
    response = br.open("http://incompetech.com/named/")
    time.sleep(randrange(5)*5)
    print bigram,
    thispage = 1
    br.form = list(br.forms())[0]  # use when form is unnamed
    namecontrol = br.form.find_control("name")
    namecontrol.value = bigram
    formresponse = br.submit()
    time.sleep(randrange(5)*3)
    html = formresponse.read()
    #extract(html, bigram)
    if html.find("You can narrow your search in the following ways") > -1:
        problems.append(bigram)
    else:
#         stopit = False
#         while stopit == False:
#             thispage += 1
#             linknames = []
#             for link in br.links():
#                 linknames.append(link.text)
#             if str(thispage) in linknames:
#                 for link in br.links():
#                     if link.text == str(thispage):
#                         request = br.click_link(link)
#                         response = br.follow_link(link)
#                         html = response.read()
#                         extract(html, bigram)
#             else:
#                 stopit = True
                
        pass
    br.back()
    
print problems


ma mb mc md me mf mg mh mi mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw nx ny nz oa ob oc od oe of og oh oi oj ok ol om on oo op oq or os ot ou ov ow ox oy oz pa pb pc pd pe pf pg ph pi pj pk pl pm pn po pp pq pr ps pt pu pv pw px py pz qa qb qc qd qe qf qg qh qi qj qk ql qm qn qo qp qq qr qs qt qu qv qw qx qy qz ra rb rc rd re rf rg rh ri rj rk rl rm rn ro rp rq rr rs rt ru rv rw rx ry rz sa sb sc sd se sf sg sh si sj sk sl sm sn so sp sq sr ss st su sv sw sx sy sz ta tb tc td te tf tg th ti tj tk tl tm tn to tp tq tr ts tt tu tv tw tx ty tz ua ub uc ud ue uf ug uh ui uj uk ul um un uo up uq ur us ut uu uv uw ux uy uz va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz wa wb wc wd we wf wg wh wi wj wk wl wm wn wo wp wq wr ws wt wu wv ww wx wy wz xa xb xc xd xe xf xg xh xi xj xk xl xm xn xo xp xq xr xs xt xu xv xw xx xy xz ya yb yc yd ye yf yg yh yi yj yk yl ym yn yo yp yq yr ys yt yu yv yw yx yy yz za zb zc zd ze zf zg zh zi zj zk zl zm zn zo zp zq zr zs zt zu zv zw zx zy zz ['ma', 'na', 'on', 'ra']

In [11]:
soup = BeautifulSoup(html)
trlist = []
for tr in soup.find_all('tr'):
  if str(tr)[:18] == '<tr><td class=\"tan':
      trlist.append(tr)
# for item in trlist:
#   if str(item)[21:24].lower() == bigram:
#       names.append(item)

In [19]:
str(soup.find_all('tr')[10])[:18]


Out[19]:
'<tr><td class="tan'

In [5]:
print problems
toredo = ['al', 'an', 'ar', 'el', 'en', 'ha', 'ia', 'in', 'la', 'ma', 'na', 'on', 'ra']
trigrams = []
for problem in toredo:
    for ltr in alphabet:
        trigrams.append(problem+ltr)


['ma', 'na', 'on', 'ra']

In [21]:
moreproblems = []
names = []

for trigram in trigrams:
    response = br.open("http://incompetech.com/named/")
    print trigram,
    time.sleep(randrange(5)*3+1)
    thispage = 1
    br.form = list(br.forms())[0]  # use when form is unnamed
    namecontrol = br.form.find_control("name")
    namecontrol.value = trigram
    formresponse = br.submit()
    html = formresponse.read()
    extract(html, trigram)
    if html.find("You can narrow your search in the following ways") > -1:
        moreproblems.append(trigram)
    else:
        stopit = False
        while stopit == False:
            thispage += 1
            linknames = []
            for link in br.links():
                linknames.append(link.text)
            if str(thispage) in linknames:
                for link in br.links():
                    if link.text == str(thispage):
                        request = br.click_link(link)
                        response = br.follow_link(link)
                        html = response.read()
                        extract(html, trigram)
            else:
                stopit = True
                
        time.sleep(randrange(4)*2+1)
    #br.back()
print " "
print moreproblems
with open('incompetechtri.txt', 'w+') as f:
  for name in names:
        f.write(str(name)+'\n')


ala alb alc ald ale alf alg alh ali alj alk all alm aln alo alp alq alr als alt alu alv alw alx aly alz ana anb anc and ane anf ang anh ani anj ank anl anm ann ano anp anq anr ans ant anu anv anw anx any anz ara arb arc ard are arf arg arh ari arj ark arl arm arn aro arp arq arr ars art aru arv arw arx ary arz ela elb elc eld ele elf elg elh eli elj elk ell elm eln elo elp elq elr els elt elu elv elw elx ely elz ena enb enc end ene enf eng enh eni enj enk enl enm enn eno enp enq enr ens ent enu env enw enx eny enz haa hab hac had hae haf hag hah hai haj hak hal ham han hao hap haq har has hat hau hav haw hax hay haz iaa iab iac iad iae iaf iag iah iai iaj iak ial iam ian iao iap iaq iar ias iat iau iav iaw iax iay iaz ina inb inc ind ine inf ing inh ini inj ink inl inm inn ino inp inq inr ins int inu inv inw inx iny inz laa lab lac lad lae laf lag lah lai laj lak lal lam lan lao lap laq lar las lat lau lav law lax lay laz maa mab mac mad mae maf mag mah mai maj mak mal mam man mao map maq mar mas mat mau mav maw max may maz naa nab nac nad nae naf nag nah nai naj nak nal nam nan nao nap naq nar nas nat nau nav naw nax nay naz ona onb onc ond one onf ong onh oni onj onk onl onm onn ono onp onq onr ons ont onu onv onw onx ony onz raa rab rac rad rae raf rag rah rai raj rak ral ram ran rao rap raq rar ras rat rau rav raw rax ray raz  
[]

In [14]:
with open('incompetech.txt', 'w+') as f:
  for name in names:
        f.write(str(name)+'\n')

In [1]:
import pandas as pd
df_meanings = pd.DataFrame()
df_variants = pd.DataFrame()
error_catcher = []

with open('incompetechall.txt', 'r+') as f:
    for line in f.readlines():
        try:
            ls = line.strip().split('\t')
            name = ls[0]
            sex = ls[2]
            if sex == 'Male':
                sex = 'M'
            if sex == 'Female':
                sex = 'F'
            df_meanings = df_meanings.append(pd.DataFrame({'name':[name], 'origin':[ls[1]], 'sex':[sex], 'meaning':[ls[3]]}), ignore_index=True)
            if len(ls)>4:
                vars = ls[4].split(', ')
                for var in vars:
                    df_variants = df_variants.append(pd.DataFrame({'name':[name], 'sex':[sex], 'variant':[var]}), ignore_index=True)
        
        except:
            error_catcher.append(line)
        
                
                
print error_catcher

df_meanings.reset_index(drop=True, inplace=True)
df_variants.reset_index(drop=True, inplace=True)

df_meanings.to_pickle('df_meaning.pickle')
df_variants.to_pickle('df_variants.pickle')


['Kafu\tTongan\tCovering\n', 'Kawana\tAustralian-Aboriginal\tFemale\n', 'Ligongo\tAfrican-Male\tWho is he?\n', 'Trutta\tAustralian-Aboriginal\tFemale\n', 'Wanikiya\tNAm.-Indian---Lakota\tMale\t\n', 'Lakkari\tAustralian-Aboriginal\tHoneysuckle tree\n']

In [1]:
import pandas as pd

In [11]:
df = pd.read_pickle('df_meaning.pickle')

In [7]:


In [26]:
df2 = pd.DataFrame()
for idx, row in df.iterrows():
    if df.sex[idx] == 'Male/Female' or df.sex[idx] == 'Both':
        n = df.name[idx]
        o = df.origin[idx]
        m = df.meaning[idx]
        df.sex[idx] = 'M'
        df2 = df2.append(pd.DataFrame({'name':[n],'sex':['F'],'origin':[o],'meaning':[m]}), ignore_index = True)

df.append(df2, ignore_index=True)
df.reset_index(drop=True,inplace=True)

In [27]:
df2 = df[(df.sex != 'F') & (df.sex != 'M')]
df2


Out[27]:
meaning name origin sex
5042 Kerklan, Kirklan Kirkland Old-English-Male From the church land
6454 God will hear Osahar African Male
7016 Rhet Rhett Old-English Mighty, strong
8955 Wahad, Wahib Wahhab Arabic Presenter, giver
9675 Female Alika African - Nigerian
9942 Arkel Arkell Old-Norse Eagle cauldron
10071 Elis Ellis Hebrew The Lord is my God
10487 Lemon Lamani Tongan Male
10815 Male Mansa African - Swahili
10845 Male Marama Polynesian
10943 Massie Massey Old-English Twin
11051 Lookout Nain Australian-Aboriginal Male
11123 Nerela, Narele Narelle Australian-Aboriginal Woman from the sea

In [38]:
dfv = pd.read_pickle('df_variants.pickle')

In [33]:
df.meaning.ix[5042] = 'From the church land'
df.origin.ix[5042] = 'Old-English'
df.sex.ix[5042] = 'M'
df.sex.ix[6454] = 'M'
df.meaning.ix[7016] = 'Mighty, strong'
df.sex.ix[7016] = 'M'
df.meaning.ix[8955] = 'Presenter, giver'
df.sex.ix[8955] = 'M'
df.meaning.ix[9675] = 'Most beautiful'
df.origin.ix[9675] = 'African---Nigerian'
df.sex.ix[9675] = 'F'
dfv = dfv[dfv.name != 'Alika']
df.meaning.ix[9942] = 'Eagle cauldron'
df.sex.ix[9942] = 'M'

df.meaning.ix[10071] = 'The Lord is my God'
df.sex.ix[10071] = 'M'

df.sex.ix[10487] = 'M'

df.meaning.ix[10815] = 'King'
df.origin.ix[10815] = 'African---Swahili'
df.sex.ix[10815] = 'M'
dfv = dfv[dfv.name != 'Mansa']

df.meaning.ix[10845] = 'Moon'
df.origin.ix[10845] = 'Polynesian'
df.sex.ix[10845] = 'M'
dfv = dfv[dfv.name != 'Marama']

df.meaning.ix[10943] = 'Twin'

In [40]:
dv = {'name': ['Kirkland', 'Kirkland', 'Rhett', 'Wahhab', 'Wahhab', 'Arkell', 'Ellis', 'Massey', 'Narelle', 'Narelle'],
      'variant': ['Kerklan', 'Kirklan', 'Rhet', 'Wahad', 'Wahib', 'Arkel', 'Elis', 'Massie', 'Nerela', 'Narele'],
	  'sex': ['M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F']}

In [41]:
dfv=dfv.append(pd.DataFrame(dv))

In [42]:
dfv.tail(50)


Out[42]:
name sex variant
16444 Raymond M Ramon
16445 Raymond M Ramond
16446 Raymond M Ramonde
16447 Raymond M Ramone
16448 Raymond M Ray
16449 Raymond M Rayment
16450 Raymond M Reamonn (Irish)
16451 Raymond M Redmond
16452 Raymond M Reymond
16453 Raymond M Reymondo
16454 Raymonde F Raemond
16455 Raymonde F Raemonda
16456 Raymonde F Raimond
16457 Raymonde F Raimonda
16458 Raymonde F Raimonde
16459 Raymonde F Rayma
16460 Raymonde F Raymonda
16461 Rayna F Raen
16462 Rayna F Raena
16463 Rayna F Rain
16464 Rayna F Raina
16465 Rayna F Raine
16466 Rayna F Reina
16467 Rayna F Reyna
16468 Raynor M Ragnar
16469 Raynor M Rainar
16470 Raynor M Rainer
16471 Raynor M Rainier
16472 Raynor M Rainor
16473 Raynor M Ranieri
16474 Raynor M Raynar
16475 Raynor M Rayner
16476 Razi F Razilee
16477 Razi F Razili
16478 Raziah F Razia
16479 Raziah F Raziela
16480 Raziah F Razilee
16481 Raziah F Razili
16482 Raziah F Raziya
16483 Razzaq M Razaq
0 Kirkland M Kerklan
1 Kirkland M Kirklan
2 Rhett M Rhet
3 Wahhab M Wahad
4 Wahhab M Wahib
5 Arkell M Arkel
6 Ellis M Elis
7 Massey M Massie
8 Narelle F Nerela
9 Narelle F Narele

In [44]:
df2 = pd.DataFrame()
dfv.reset_index(drop=True, inplace=True)
for idx, row in dfv.iterrows():
    if dfv.sex[idx] == 'Male/Female' or dfv.sex[idx] == 'Both':
        n = dfv.name[idx]
        dfv.sex[idx] = 'M'
        v = dfv.variant[idx]
        df2 = df2.append(pd.DataFrame({'name':[n],'sex':['F'],'variant':[v]}), ignore_index = True)

dfv.append(df2, ignore_index=True)
dfv.reset_index(drop=True,inplace=True)

In [45]:
dfv.tail()


Out[45]:
name sex variant
16489 Arkell M Arkel
16490 Ellis M Elis
16491 Massey M Massie
16492 Narelle F Nerela
16493 Narelle F Narele

In [46]:
dfv.sex.unique()


Out[46]:
array(['F', 'M', '- Nigerian', ' Male', '- Swahili', 'Polynesian'], dtype=object)

In [2]:
df.to_pickle('df_meaning_incompetech.pickle')
dfv.to_pickle('df_variants_incompetech.pickle')


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-a18cc88d4b04> in <module>()
----> 1 df.to_pickle('df_meaning_incompetech.pickle')
      2 dfv.to_pickle('df_variants_ict.pickle')

NameError: name 'df' is not defined

Cleanup variants - startpoint


In [4]:
import pandas as pd
df = pd.read_pickle('df_variants_ict.pickle')

In [6]:
df[df.variant.str.contains("\(")].head(10)


Out[6]:
name sex variant
88 Abelia F Abelle (feminine of Abel)
101 Abigail F Abaigeal (Irish)
229 Adam M Adamo (Italian)
231 Adam M Adan (Spanish)
232 Adam M Adao (Portuguese)
242 Adam M Ado (Estonian)
243 Adam M Adomas (Lithuanian)
268 Adelaide F Adalia (Spanish)
275 Adelaide F Adelajda (Polish)
277 Adelaide F Adelhaid (German)

In [7]:
df['parenthetical_info']=''

In [8]:
x = df.variant.ix[88]

In [9]:
x


Out[9]:
'Abelle (feminine of Abel)'

In [10]:
import re

In [14]:
print re.search(" \(.+\)", x).group(0)[2:-1]


feminine of Abel

In [15]:
print re.sub(" \(.+\)", '', x)


Abelle

In [16]:
for i in range(len(df)):
    var = df.variant.iloc[i]
    if re.search(" \(.+\)", var):
        df.variant.iloc[i] = re.sub(" \(.+\)", '', var)
        df.parenthetical_info[i] = re.search(" \(.+\)", var).group(0)[2:-1]

In [17]:
print df[df.parenthetical_info != ''].head(10)


         name sex   variant parenthetical_info
88     Abelia   F    Abelle   feminine of Abel
101   Abigail   F  Abaigeal              Irish
229      Adam   M     Adamo            Italian
231      Adam   M      Adan            Spanish
232      Adam   M      Adao         Portuguese
242      Adam   M       Ado           Estonian
243      Adam   M    Adomas         Lithuanian
268  Adelaide   F    Adalia            Spanish
275  Adelaide   F  Adelajda             Polish
277  Adelaide   F  Adelhaid             German

In [18]:
df.to_pickle('df_variants_ict.pickle')

In [ ]: