notebook.community

Edit and run



In [1]:

    
import mechanize
from random import randrange
from bs4 import BeautifulSoup
import time
alphabet = 'abcdefghijklmnopqrstuvwxyz'
bigrams = []
for i in range(12,26):
   for j in range(26):
           bigrams.append(alphabet[i]+alphabet[j])
            
names = []



In [20]:

    
def extract(html, bigram):
      global names
      soup = BeautifulSoup(html)
      trlist = []
      for tr in soup.find_all('tr'):
          if str(tr)[:18] == '<tr><td class=\"tan':
              trlist.append(tr)
      for item in trlist:
          if str(item)[21:24].lower() == bigram:
              names.append(item)



In [3]:

    
br = mechanize.Browser()
#br.set_all_readonly(False)    # allow everything to be written to
br.set_handle_robots(False)   # ignore robots
br.set_handle_refresh(False)  # can sometimes hang without this
br.addheaders = [('User-agent', 'Chrome')]



In [4]:

    
#

problems = []

for bigram in bigrams:
    response = br.open("http://incompetech.com/named/")
    time.sleep(randrange(5)*5)
    print bigram,
    thispage = 1
    br.form = list(br.forms())[0]  # use when form is unnamed
    namecontrol = br.form.find_control("name")
    namecontrol.value = bigram
    formresponse = br.submit()
    time.sleep(randrange(5)*3)
    html = formresponse.read()
    #extract(html, bigram)
    if html.find("You can narrow your search in the following ways") > -1:
        problems.append(bigram)
    else:
#         stopit = False
#         while stopit == False:
#             thispage += 1
#             linknames = []
#             for link in br.links():
#                 linknames.append(link.text)
#             if str(thispage) in linknames:
#                 for link in br.links():
#                     if link.text == str(thispage):
#                         request = br.click_link(link)
#                         response = br.follow_link(link)
#                         html = response.read()
#                         extract(html, bigram)
#             else:
#                 stopit = True
                
        pass
    br.back()
    
print problems









    



ma mb mc md me mf mg mh mi mj mk ml mm mn mo mp mq mr ms mt mu mv mw mx my mz na nb nc nd ne nf ng nh ni nj nk nl nm nn no np nq nr ns nt nu nv nw nx ny nz oa ob oc od oe of og oh oi oj ok ol om on oo op oq or os ot ou ov ow ox oy oz pa pb pc pd pe pf pg ph pi pj pk pl pm pn po pp pq pr ps pt pu pv pw px py pz qa qb qc qd qe qf qg qh qi qj qk ql qm qn qo qp qq qr qs qt qu qv qw qx qy qz ra rb rc rd re rf rg rh ri rj rk rl rm rn ro rp rq rr rs rt ru rv rw rx ry rz sa sb sc sd se sf sg sh si sj sk sl sm sn so sp sq sr ss st su sv sw sx sy sz ta tb tc td te tf tg th ti tj tk tl tm tn to tp tq tr ts tt tu tv tw tx ty tz ua ub uc ud ue uf ug uh ui uj uk ul um un uo up uq ur us ut uu uv uw ux uy uz va vb vc vd ve vf vg vh vi vj vk vl vm vn vo vp vq vr vs vt vu vv vw vx vy vz wa wb wc wd we wf wg wh wi wj wk wl wm wn wo wp wq wr ws wt wu wv ww wx wy wz xa xb xc xd xe xf xg xh xi xj xk xl xm xn xo xp xq xr xs xt xu xv xw xx xy xz ya yb yc yd ye yf yg yh yi yj yk yl ym yn yo yp yq yr ys yt yu yv yw yx yy yz za zb zc zd ze zf zg zh zi zj zk zl zm zn zo zp zq zr zs zt zu zv zw zx zy zz ['ma', 'na', 'on', 'ra']



In [11]:

    
soup = BeautifulSoup(html)
trlist = []
for tr in soup.find_all('tr'):
  if str(tr)[:18] == '<tr><td class=\"tan':
      trlist.append(tr)
# for item in trlist:
#   if str(item)[21:24].lower() == bigram:
#       names.append(item)



In [19]:

    
str(soup.find_all('tr')[10])[:18]









    Out[19]:





'<tr><td class="tan'



In [5]:

    
print problems
toredo = ['al', 'an', 'ar', 'el', 'en', 'ha', 'ia', 'in', 'la', 'ma', 'na', 'on', 'ra']
trigrams = []
for problem in toredo:
    for ltr in alphabet:
        trigrams.append(problem+ltr)









    



['ma', 'na', 'on', 'ra']



In [21]:

    
moreproblems = []
names = []

for trigram in trigrams:
    response = br.open("http://incompetech.com/named/")
    print trigram,
    time.sleep(randrange(5)*3+1)
    thispage = 1
    br.form = list(br.forms())[0]  # use when form is unnamed
    namecontrol = br.form.find_control("name")
    namecontrol.value = trigram
    formresponse = br.submit()
    html = formresponse.read()
    extract(html, trigram)
    if html.find("You can narrow your search in the following ways") > -1:
        moreproblems.append(trigram)
    else:
        stopit = False
        while stopit == False:
            thispage += 1
            linknames = []
            for link in br.links():
                linknames.append(link.text)
            if str(thispage) in linknames:
                for link in br.links():
                    if link.text == str(thispage):
                        request = br.click_link(link)
                        response = br.follow_link(link)
                        html = response.read()
                        extract(html, trigram)
            else:
                stopit = True
                
        time.sleep(randrange(4)*2+1)
    #br.back()
print " "
print moreproblems
with open('incompetechtri.txt', 'w+') as f:
  for name in names:
        f.write(str(name)+'\n')









    



ala alb alc ald ale alf alg alh ali alj alk all alm aln alo alp alq alr als alt alu alv alw alx aly alz ana anb anc and ane anf ang anh ani anj ank anl anm ann ano anp anq anr ans ant anu anv anw anx any anz ara arb arc ard are arf arg arh ari arj ark arl arm arn aro arp arq arr ars art aru arv arw arx ary arz ela elb elc eld ele elf elg elh eli elj elk ell elm eln elo elp elq elr els elt elu elv elw elx ely elz ena enb enc end ene enf eng enh eni enj enk enl enm enn eno enp enq enr ens ent enu env enw enx eny enz haa hab hac had hae haf hag hah hai haj hak hal ham han hao hap haq har has hat hau hav haw hax hay haz iaa iab iac iad iae iaf iag iah iai iaj iak ial iam ian iao iap iaq iar ias iat iau iav iaw iax iay iaz ina inb inc ind ine inf ing inh ini inj ink inl inm inn ino inp inq inr ins int inu inv inw inx iny inz laa lab lac lad lae laf lag lah lai laj lak lal lam lan lao lap laq lar las lat lau lav law lax lay laz maa mab mac mad mae maf mag mah mai maj mak mal mam man mao map maq mar mas mat mau mav maw max may maz naa nab nac nad nae naf nag nah nai naj nak nal nam nan nao nap naq nar nas nat nau nav naw nax nay naz ona onb onc ond one onf ong onh oni onj onk onl onm onn ono onp onq onr ons ont onu onv onw onx ony onz raa rab rac rad rae raf rag rah rai raj rak ral ram ran rao rap raq rar ras rat rau rav raw rax ray raz  
[]



In [14]:

    
with open('incompetech.txt', 'w+') as f:
  for name in names:
        f.write(str(name)+'\n')



In [1]:

    
import pandas as pd
df_meanings = pd.DataFrame()
df_variants = pd.DataFrame()
error_catcher = []

with open('incompetechall.txt', 'r+') as f:
    for line in f.readlines():
        try:
            ls = line.strip().split('\t')
            name = ls[0]
            sex = ls[2]
            if sex == 'Male':
                sex = 'M'
            if sex == 'Female':
                sex = 'F'
            df_meanings = df_meanings.append(pd.DataFrame({'name':[name], 'origin':[ls[1]], 'sex':[sex], 'meaning':[ls[3]]}), ignore_index=True)
            if len(ls)>4:
                vars = ls[4].split(', ')
                for var in vars:
                    df_variants = df_variants.append(pd.DataFrame({'name':[name], 'sex':[sex], 'variant':[var]}), ignore_index=True)
        
        except:
            error_catcher.append(line)
        
                
                
print error_catcher

df_meanings.reset_index(drop=True, inplace=True)
df_variants.reset_index(drop=True, inplace=True)

df_meanings.to_pickle('df_meaning.pickle')
df_variants.to_pickle('df_variants.pickle')









    



['Kafu\tTongan\tCovering\n', 'Kawana\tAustralian-Aboriginal\tFemale\n', 'Ligongo\tAfrican-Male\tWho is he?\n', 'Trutta\tAustralian-Aboriginal\tFemale\n', 'Wanikiya\tNAm.-Indian---Lakota\tMale\t\n', 'Lakkari\tAustralian-Aboriginal\tHoneysuckle tree\n']



In [1]:

    
import pandas as pd



In [11]:

    
df = pd.read_pickle('df_meaning.pickle')



In [7]:



In [26]:

    
df2 = pd.DataFrame()
for idx, row in df.iterrows():
    if df.sex[idx] == 'Male/Female' or df.sex[idx] == 'Both':
        n = df.name[idx]
        o = df.origin[idx]
        m = df.meaning[idx]
        df.sex[idx] = 'M'
        df2 = df2.append(pd.DataFrame({'name':[n],'sex':['F'],'origin':[o],'meaning':[m]}), ignore_index = True)

df.append(df2, ignore_index=True)
df.reset_index(drop=True,inplace=True)



In [27]:

    
df2 = df[(df.sex != 'F') & (df.sex != 'M')]
df2









    Out[27]:






  
    
      
      meaning
      name
      origin
      sex
    
  
  
    
      5042 
       Kerklan, Kirklan
       Kirkland
            Old-English-Male
       From the church land
    
    
      6454 
          God will hear
         Osahar
                     African
                       Male
    
    
      7016 
                   Rhet
          Rhett
                 Old-English
             Mighty, strong
    
    
      8955 
           Wahad, Wahib
         Wahhab
                      Arabic
           Presenter, giver
    
    
      9675 
                 Female
          Alika
                     African
                 - Nigerian
    
    
      9942 
                  Arkel
         Arkell
                   Old-Norse
             Eagle cauldron
    
    
      10071
                   Elis
          Ellis
                      Hebrew
         The Lord is my God
    
    
      10487
                  Lemon
         Lamani
                      Tongan
                       Male
    
    
      10815
                   Male
          Mansa
                     African
                  - Swahili
    
    
      10845
                   Male
         Marama
                            
                 Polynesian
    
    
      10943
                 Massie
         Massey
                 Old-English
                       Twin
    
    
      11051
                Lookout
           Nain
       Australian-Aboriginal
                       Male
    
    
      11123
         Nerela, Narele
        Narelle
       Australian-Aboriginal
         Woman from the sea



In [38]:

    
dfv = pd.read_pickle('df_variants.pickle')



In [33]:

    
df.meaning.ix[5042] = 'From the church land'
df.origin.ix[5042] = 'Old-English'
df.sex.ix[5042] = 'M'
df.sex.ix[6454] = 'M'
df.meaning.ix[7016] = 'Mighty, strong'
df.sex.ix[7016] = 'M'
df.meaning.ix[8955] = 'Presenter, giver'
df.sex.ix[8955] = 'M'
df.meaning.ix[9675] = 'Most beautiful'
df.origin.ix[9675] = 'African---Nigerian'
df.sex.ix[9675] = 'F'
dfv = dfv[dfv.name != 'Alika']
df.meaning.ix[9942] = 'Eagle cauldron'
df.sex.ix[9942] = 'M'

df.meaning.ix[10071] = 'The Lord is my God'
df.sex.ix[10071] = 'M'

df.sex.ix[10487] = 'M'

df.meaning.ix[10815] = 'King'
df.origin.ix[10815] = 'African---Swahili'
df.sex.ix[10815] = 'M'
dfv = dfv[dfv.name != 'Mansa']

df.meaning.ix[10845] = 'Moon'
df.origin.ix[10845] = 'Polynesian'
df.sex.ix[10845] = 'M'
dfv = dfv[dfv.name != 'Marama']

df.meaning.ix[10943] = 'Twin'



In [40]:

    
dv = {'name': ['Kirkland', 'Kirkland', 'Rhett', 'Wahhab', 'Wahhab', 'Arkell', 'Ellis', 'Massey', 'Narelle', 'Narelle'],
      'variant': ['Kerklan', 'Kirklan', 'Rhet', 'Wahad', 'Wahib', 'Arkel', 'Elis', 'Massie', 'Nerela', 'Narele'],
	  'sex': ['M', 'M', 'M', 'M', 'M', 'M', 'M', 'M', 'F', 'F']}



In [41]:

    
dfv=dfv.append(pd.DataFrame(dv))



In [42]:

    
dfv.tail(50)









    Out[42]:






  
    
      
      name
      sex
      variant
    
  
  
    
      16444
        Raymond
       M
                 Ramon
    
    
      16445
        Raymond
       M
                Ramond
    
    
      16446
        Raymond
       M
               Ramonde
    
    
      16447
        Raymond
       M
                Ramone
    
    
      16448
        Raymond
       M
                   Ray
    
    
      16449
        Raymond
       M
               Rayment
    
    
      16450
        Raymond
       M
       Reamonn (Irish)
    
    
      16451
        Raymond
       M
               Redmond
    
    
      16452
        Raymond
       M
               Reymond
    
    
      16453
        Raymond
       M
              Reymondo
    
    
      16454
       Raymonde
       F
               Raemond
    
    
      16455
       Raymonde
       F
              Raemonda
    
    
      16456
       Raymonde
       F
               Raimond
    
    
      16457
       Raymonde
       F
              Raimonda
    
    
      16458
       Raymonde
       F
              Raimonde
    
    
      16459
       Raymonde
       F
                 Rayma
    
    
      16460
       Raymonde
       F
              Raymonda
    
    
      16461
          Rayna
       F
                  Raen
    
    
      16462
          Rayna
       F
                 Raena
    
    
      16463
          Rayna
       F
                  Rain
    
    
      16464
          Rayna
       F
                 Raina
    
    
      16465
          Rayna
       F
                 Raine
    
    
      16466
          Rayna
       F
                 Reina
    
    
      16467
          Rayna
       F
                 Reyna
    
    
      16468
         Raynor
       M
                Ragnar
    
    
      16469
         Raynor
       M
                Rainar
    
    
      16470
         Raynor
       M
                Rainer
    
    
      16471
         Raynor
       M
               Rainier
    
    
      16472
         Raynor
       M
                Rainor
    
    
      16473
         Raynor
       M
               Ranieri
    
    
      16474
         Raynor
       M
                Raynar
    
    
      16475
         Raynor
       M
                Rayner
    
    
      16476
           Razi
       F
               Razilee
    
    
      16477
           Razi
       F
                Razili
    
    
      16478
         Raziah
       F
                 Razia
    
    
      16479
         Raziah
       F
               Raziela
    
    
      16480
         Raziah
       F
               Razilee
    
    
      16481
         Raziah
       F
                Razili
    
    
      16482
         Raziah
       F
                Raziya
    
    
      16483
         Razzaq
       M
                 Razaq
    
    
      0    
       Kirkland
       M
               Kerklan
    
    
      1    
       Kirkland
       M
               Kirklan
    
    
      2    
          Rhett
       M
                  Rhet
    
    
      3    
         Wahhab
       M
                 Wahad
    
    
      4    
         Wahhab
       M
                 Wahib
    
    
      5    
         Arkell
       M
                 Arkel
    
    
      6    
          Ellis
       M
                  Elis
    
    
      7    
         Massey
       M
                Massie
    
    
      8    
        Narelle
       F
                Nerela
    
    
      9    
        Narelle
       F
                Narele



In [44]:

    
df2 = pd.DataFrame()
dfv.reset_index(drop=True, inplace=True)
for idx, row in dfv.iterrows():
    if dfv.sex[idx] == 'Male/Female' or dfv.sex[idx] == 'Both':
        n = dfv.name[idx]
        dfv.sex[idx] = 'M'
        v = dfv.variant[idx]
        df2 = df2.append(pd.DataFrame({'name':[n],'sex':['F'],'variant':[v]}), ignore_index = True)

dfv.append(df2, ignore_index=True)
dfv.reset_index(drop=True,inplace=True)



In [45]:

    
dfv.tail()









    Out[45]:






  
    
      
      name
      sex
      variant
    
  
  
    
      16489
        Arkell
       M
        Arkel
    
    
      16490
         Ellis
       M
         Elis
    
    
      16491
        Massey
       M
       Massie
    
    
      16492
       Narelle
       F
       Nerela
    
    
      16493
       Narelle
       F
       Narele



In [46]:

    
dfv.sex.unique()









    Out[46]:





array(['F', 'M', '- Nigerian', ' Male', '- Swahili', 'Polynesian'], dtype=object)



In [2]:

    
df.to_pickle('df_meaning_incompetech.pickle')
dfv.to_pickle('df_variants_incompetech.pickle')









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-a18cc88d4b04> in <module>()
----> 1 df.to_pickle('df_meaning_incompetech.pickle')
      2 dfv.to_pickle('df_variants_ict.pickle')

NameError: name 'df' is not defined

Cleanup variants - startpoint



In [4]:

    
import pandas as pd
df = pd.read_pickle('df_variants_ict.pickle')



In [6]:

    
df[df.variant.str.contains("\(")].head(10)









    Out[6]:






  
    
      
      name
      sex
      variant
    
  
  
    
      88 
         Abelia
       F
       Abelle (feminine of Abel)
    
    
      101
        Abigail
       F
                Abaigeal (Irish)
    
    
      229
           Adam
       M
                 Adamo (Italian)
    
    
      231
           Adam
       M
                  Adan (Spanish)
    
    
      232
           Adam
       M
               Adao (Portuguese)
    
    
      242
           Adam
       M
                  Ado (Estonian)
    
    
      243
           Adam
       M
             Adomas (Lithuanian)
    
    
      268
       Adelaide
       F
                Adalia (Spanish)
    
    
      275
       Adelaide
       F
               Adelajda (Polish)
    
    
      277
       Adelaide
       F
               Adelhaid (German)



In [7]:

    
df['parenthetical_info']=''



In [8]:

    
x = df.variant.ix[88]



In [9]:

    
x









    Out[9]:





'Abelle (feminine of Abel)'



In [10]:

    
import re



In [14]:

    
print re.search(" \(.+\)", x).group(0)[2:-1]









    



feminine of Abel



In [15]:

    
print re.sub(" \(.+\)", '', x)









    



Abelle



In [16]:

    
for i in range(len(df)):
    var = df.variant.iloc[i]
    if re.search(" \(.+\)", var):
        df.variant.iloc[i] = re.sub(" \(.+\)", '', var)
        df.parenthetical_info[i] = re.search(" \(.+\)", var).group(0)[2:-1]



In [17]:

    
print df[df.parenthetical_info != ''].head(10)









    



         name sex   variant parenthetical_info
88     Abelia   F    Abelle   feminine of Abel
101   Abigail   F  Abaigeal              Irish
229      Adam   M     Adamo            Italian
231      Adam   M      Adan            Spanish
232      Adam   M      Adao         Portuguese
242      Adam   M       Ado           Estonian
243      Adam   M    Adomas         Lithuanian
268  Adelaide   F    Adalia            Spanish
275  Adelaide   F  Adelajda             Polish
277  Adelaide   F  Adelhaid             German



In [18]:

    
df.to_pickle('df_variants_ict.pickle')



In [ ]:

	meaning	name	origin	sex
5042	Kerklan, Kirklan	Kirkland	Old-English-Male	From the church land
6454	God will hear	Osahar	African	Male
7016	Rhet	Rhett	Old-English	Mighty, strong
8955	Wahad, Wahib	Wahhab	Arabic	Presenter, giver
9675	Female	Alika	African	- Nigerian
9942	Arkel	Arkell	Old-Norse	Eagle cauldron
10071	Elis	Ellis	Hebrew	The Lord is my God
10487	Lemon	Lamani	Tongan	Male
10815	Male	Mansa	African	- Swahili
10845	Male	Marama		Polynesian
10943	Massie	Massey	Old-English	Twin
11051	Lookout	Nain	Australian-Aboriginal	Male
11123	Nerela, Narele	Narelle	Australian-Aboriginal	Woman from the sea

	name	sex	variant
16444	Raymond	M	Ramon
16445	Raymond	M	Ramond
16446	Raymond	M	Ramonde
16447	Raymond	M	Ramone
16448	Raymond	M	Ray
16449	Raymond	M	Rayment
16450	Raymond	M	Reamonn (Irish)
16451	Raymond	M	Redmond
16452	Raymond	M	Reymond
16453	Raymond	M	Reymondo
16454	Raymonde	F	Raemond
16455	Raymonde	F	Raemonda
16456	Raymonde	F	Raimond
16457	Raymonde	F	Raimonda
16458	Raymonde	F	Raimonde
16459	Raymonde	F	Rayma
16460	Raymonde	F	Raymonda
16461	Rayna	F	Raen
16462	Rayna	F	Raena
16463	Rayna	F	Rain
16464	Rayna	F	Raina
16465	Rayna	F	Raine
16466	Rayna	F	Reina
16467	Rayna	F	Reyna
16468	Raynor	M	Ragnar
16469	Raynor	M	Rainar
16470	Raynor	M	Rainer
16471	Raynor	M	Rainier
16472	Raynor	M	Rainor
16473	Raynor	M	Ranieri
16474	Raynor	M	Raynar
16475	Raynor	M	Rayner
16476	Razi	F	Razilee
16477	Razi	F	Razili
16478	Raziah	F	Razia
16479	Raziah	F	Raziela
16480	Raziah	F	Razilee
16481	Raziah	F	Razili
16482	Raziah	F	Raziya
16483	Razzaq	M	Razaq
0	Kirkland	M	Kerklan
1	Kirkland	M	Kirklan
2	Rhett	M	Rhet
3	Wahhab	M	Wahad
4	Wahhab	M	Wahib
5	Arkell	M	Arkel
6	Ellis	M	Elis
7	Massey	M	Massie
8	Narelle	F	Nerela
9	Narelle	F	Narele

	name	sex	variant
16489	Arkell	M	Arkel
16490	Ellis	M	Elis
16491	Massey	M	Massie
16492	Narelle	F	Nerela
16493	Narelle	F	Narele

	name	sex	variant
88	Abelia	F	Abelle (feminine of Abel)
101	Abigail	F	Abaigeal (Irish)
229	Adam	M	Adamo (Italian)
231	Adam	M	Adan (Spanish)
232	Adam	M	Adao (Portuguese)
242	Adam	M	Ado (Estonian)
243	Adam	M	Adomas (Lithuanian)
268	Adelaide	F	Adalia (Spanish)
275	Adelaide	F	Adelajda (Polish)
277	Adelaide	F	Adelhaid (German)