In [1]:
import mechanize
from random import randrange
from bs4 import BeautifulSoup
import time
alphabet = 'abcdefghijklmnopqrstuvwxyz'
# Every two-letter search prefix whose first letter is alphabet[12] ('m') or
# later, in nested-loop order: 'ma', 'mb', ..., 'zy', 'zz'.
bigrams = [first + second for first in alphabet[12:26] for second in alphabet]
# Accumulator for scraped result rows; extract() appends to it.
names = []
In [20]:
def extract(html, bigram):
    """Collect matching result rows from a results page into the global ``names``.

    Parses ``html`` with BeautifulSoup, looks at every ``<tr>`` whose markup
    starts with ``<tr><td class="tan``, and appends the row to ``names`` when
    the markup from character offset 21 onwards matches ``bigram``
    (case-insensitively on the row side).

    The comparison window is sized to ``len(bigram)``. A fixed 3-character
    window can never equal a 2-letter prefix, so with the hard-coded slice
    only 3-letter prefixes were able to match.

    Args:
        html: Markup of the results page.
        bigram: Lower-case search prefix (any length) a row must start with.

    Returns:
        None. Matching ``<tr>`` tags are appended to the module-level ``names``.
    """
    global names
    row_prefix = '<tr><td class=\"tan'
    # Offset inside str(tr) where the comparison starts.
    # NOTE(review): presumably where the name text begins - confirm against the page markup.
    name_start = 21
    name_end = name_start + len(bigram)
    soup = BeautifulSoup(html)
    for tr in soup.find_all('tr'):
        markup = str(tr)
        if not markup.startswith(row_prefix):
            continue
        if markup[name_start:name_end].lower() == bigram:
            names.append(tr)
In [3]:
# Create the mechanize browser object that the scraping cells share.
br = mechanize.Browser()
#br.set_all_readonly(False) # allow everything to be written to
br.set_handle_robots(False) # ignore robots
br.set_handle_refresh(False) # can sometimes hang without this
# Replace the browser's header list with a single User-agent entry.
br.addheaders = [('User-agent', 'Chrome')]
In [4]:
#
problems = []
for bigram in bigrams:
response = br.open("http://incompetech.com/named/")
time.sleep(randrange(5)*5)
print bigram,
thispage = 1
br.form = list(br.forms())[0] # use when form is unnamed
namecontrol = br.form.find_control("name")
namecontrol.value = bigram
formresponse = br.submit()
time.sleep(randrange(5)*3)
html = formresponse.read()
#extract(html, bigram)
if html.find("You can narrow your search in the following ways") > -1:
problems.append(bigram)
else:
# stopit = False
# while stopit == False:
# thispage += 1
# linknames = []
# for link in br.links():
# linknames.append(link.text)
# if str(thispage) in linknames:
# for link in br.links():
# if link.text == str(thispage):
# request = br.click_link(link)
# response = br.follow_link(link)
# html = response.read()
# extract(html, bigram)
# else:
# stopit = True
pass
br.back()
print problems
In [11]:
# Parse the most recently fetched page and keep only the <tr> elements whose
# markup begins with the 18-character prefix below. The per-row prefix check
# that extract() performs is not applied here.
soup = BeautifulSoup(html)
trlist = [
    tr for tr in soup.find_all('tr')
    if str(tr)[:18] == '<tr><td class=\"tan'
]
In [19]:
# Show the first 18 characters of the markup of the 11th <tr> on the page.
str(soup.find_all('tr')[10])[:18]
Out[19]:
In [5]:
print(problems)
# Two-letter prefixes to search again with a third letter appended.
toredo = ['al', 'an', 'ar', 'el', 'en', 'ha', 'ia', 'in', 'la', 'ma', 'na', 'on', 'ra']
# Each prefix followed by every letter of the alphabet: 'ala', 'alb', ..., 'raz'.
trigrams = [stem + ltr for stem in toredo for ltr in alphabet]
In [21]:
moreproblems = []
names = []
for trigram in trigrams:
response = br.open("http://incompetech.com/named/")
print trigram,
time.sleep(randrange(5)*3+1)
thispage = 1
br.form = list(br.forms())[0] # use when form is unnamed
namecontrol = br.form.find_control("name")
namecontrol.value = trigram
formresponse = br.submit()
html = formresponse.read()
extract(html, trigram)
if html.find("You can narrow your search in the following ways") > -1:
moreproblems.append(trigram)
else:
stopit = False
while stopit == False:
thispage += 1
linknames = []
for link in br.links():
linknames.append(link.text)
if str(thispage) in linknames:
for link in br.links():
if link.text == str(thispage):
request = br.click_link(link)
response = br.follow_link(link)
html = response.read()
extract(html, trigram)
else:
stopit = True
time.sleep(randrange(4)*2+1)
#br.back()
print " "
print moreproblems
with open('incompetechtri.txt', 'w+') as f:
for name in names:
f.write(str(name)+'\n')
In [14]:
# Dump every collected row to incompetech.txt, one row per line.
with open('incompetech.txt', 'w+') as f:
    f.writelines(str(name) + '\n' for name in names)
In [1]:
import pandas as pd
# Parse the tab-separated name list into two tables and pickle them:
#   df_meanings - one row per input line: name, origin, sex, meaning
#   df_variants - one row per (name, variant) pair
# Fields are read by position from each line:
#   name <TAB> origin <TAB> sex <TAB> meaning [<TAB> variant, variant, ...]
meaning_rows = []
variant_rows = []
error_catcher = []  # raw lines that have fewer than four tab-separated fields
with open('incompetechall.txt', 'r') as f:
    for line in f:
        ls = line.strip().split('\t')
        try:
            name, origin, sex, meaning = ls[0], ls[1], ls[2], ls[3]
        except IndexError:
            # Too few fields: keep the line for inspection and move on.
            error_catcher.append(line)
            continue
        # Abbreviate the two spelled-out values; anything else passes through.
        sex = {'Male': 'M', 'Female': 'F'}.get(sex, sex)
        meaning_rows.append(
            {'name': name, 'origin': origin, 'sex': sex, 'meaning': meaning})
        if len(ls) > 4:
            for variant in ls[4].split(', '):
                variant_rows.append(
                    {'name': name, 'sex': sex, 'variant': variant})
print(error_catcher)
# Build each frame once from the collected records (no per-row append), with
# an explicit column order so the layout does not depend on dict ordering.
df_meanings = pd.DataFrame(meaning_rows, columns=['meaning', 'name', 'origin', 'sex'])
df_variants = pd.DataFrame(variant_rows, columns=['name', 'sex', 'variant'])
df_meanings.to_pickle('df_meaning.pickle')
df_variants.to_pickle('df_variants.pickle')
In [1]:
import pandas as pd
In [11]:
# Reload the meanings table that the parsing cell saved as df_meaning.pickle.
df = pd.read_pickle('df_meaning.pickle')
In [7]:
In [26]:
# Turn every row whose sex is 'Male/Female' or 'Both' into two rows: the
# existing row becomes 'M' and a copy with sex 'F' is added at the end.
unisex = df['sex'].isin(['Male/Female', 'Both'])
# df2 holds the 'F' copies; take them before the originals are relabelled.
df2 = df[unisex].copy()
df2['sex'] = 'F'
df2 = df2.reset_index(drop=True)
# One .loc assignment on the frame itself instead of chained df.sex[idx] = ...
df.loc[unisex, 'sex'] = 'M'
if len(df2):
    # The concatenated result must be bound back to df; calling append/concat
    # without assigning leaves df unchanged and drops the 'F' copies.
    df = pd.concat([df, df2], ignore_index=True)
df = df.reset_index(drop=True)
In [27]:
# Rows whose sex is neither 'F' nor 'M'.
df2 = df[~df.sex.isin(['F', 'M'])]
df2
Out[27]:
In [38]:
# Reload the variants table that the parsing cell saved as df_variants.pickle.
dfv = pd.read_pickle('df_variants.pickle')
In [33]:
# Hand corrections to individual rows of the meanings table, addressed by
# index label, interleaved with removal of three names from the variants
# table. Each write is a single .loc[label, column] assignment on the frame
# itself, so it does not go through a chained-indexing intermediate.
df.loc[5042, 'meaning'] = 'From the church land'
df.loc[5042, 'origin'] = 'Old-English'
df.loc[5042, 'sex'] = 'M'
df.loc[6454, 'sex'] = 'M'
df.loc[7016, 'meaning'] = 'Mighty, strong'
df.loc[7016, 'sex'] = 'M'
df.loc[8955, 'meaning'] = 'Presenter, giver'
df.loc[8955, 'sex'] = 'M'
df.loc[9675, 'meaning'] = 'Most beautiful'
df.loc[9675, 'origin'] = 'African---Nigerian'
df.loc[9675, 'sex'] = 'F'
dfv = dfv[dfv['name'] != 'Alika']
df.loc[9942, 'meaning'] = 'Eagle cauldron'
df.loc[9942, 'sex'] = 'M'
df.loc[10071, 'meaning'] = 'The Lord is my God'
df.loc[10071, 'sex'] = 'M'
df.loc[10487, 'sex'] = 'M'
df.loc[10815, 'meaning'] = 'King'
df.loc[10815, 'origin'] = 'African---Swahili'
df.loc[10815, 'sex'] = 'M'
dfv = dfv[dfv['name'] != 'Mansa']
df.loc[10845, 'meaning'] = 'Moon'
df.loc[10845, 'origin'] = 'Polynesian'
df.loc[10845, 'sex'] = 'M'
dfv = dfv[dfv['name'] != 'Marama']
df.loc[10943, 'meaning'] = 'Twin'
In [40]:
# Hand-entered variant rows as (name, variant, sex) triples.
manual_variants = [
    ('Kirkland', 'Kerklan', 'M'),
    ('Kirkland', 'Kirklan', 'M'),
    ('Rhett', 'Rhet', 'M'),
    ('Wahhab', 'Wahad', 'M'),
    ('Wahhab', 'Wahib', 'M'),
    ('Arkell', 'Arkel', 'M'),
    ('Ellis', 'Elis', 'M'),
    ('Massey', 'Massie', 'M'),
    ('Narelle', 'Nerela', 'F'),
    ('Narelle', 'Narele', 'F'),
]
# Column-oriented dict built from the triples, ready for pd.DataFrame(dv).
dv = {
    'name': [entry[0] for entry in manual_variants],
    'variant': [entry[1] for entry in manual_variants],
    'sex': [entry[2] for entry in manual_variants],
}
In [41]:
# Add the hand-entered rows to the variants table; index labels are kept as
# they are here (not renumbered).
dfv = pd.concat([dfv, pd.DataFrame(dv)])
In [42]:
# Inspect the last 50 rows of the variants table.
dfv.tail(50)
Out[42]:
In [44]:
# Turn every variant row whose sex is 'Male/Female' or 'Both' into two rows:
# the existing row becomes 'M' and a copy with sex 'F' is added at the end.
dfv = dfv.reset_index(drop=True)  # unique 0..n-1 labels before masking
unisex = dfv['sex'].isin(['Male/Female', 'Both'])
# df2 holds the 'F' copies; take them before the originals are relabelled.
df2 = dfv[unisex].copy()
df2['sex'] = 'F'
df2 = df2.reset_index(drop=True)
# One .loc assignment on the frame itself instead of chained dfv.sex[idx] = ...
dfv.loc[unisex, 'sex'] = 'M'
if len(df2):
    # The concatenated result must be bound back to dfv; calling append/concat
    # without assigning leaves dfv unchanged and drops the 'F' copies.
    dfv = pd.concat([dfv, df2], ignore_index=True)
In [45]:
# Inspect the last few rows of the variants table.
dfv.tail()
Out[45]:
In [46]:
# List the distinct values present in the variants table's sex column.
dfv.sex.unique()
Out[46]:
In [2]:
# Save the meanings table (df) and the variants table (dfv) under new file names.
df.to_pickle('df_meaning_incompetech.pickle')
dfv.to_pickle('df_variants_incompetech.pickle')
In [4]:
import pandas as pd
# From here on `df` is rebound to a variants table loaded from disk.
# NOTE(review): the save cell above writes 'df_variants_incompetech.pickle', not
# 'df_variants_ict.pickle' - confirm where this input file comes from.
df = pd.read_pickle('df_variants_ict.pickle')
In [6]:
# Preview the first 10 variants that contain an opening parenthesis.
# Raw string: the pattern is a regex, and "\(" in a plain literal is an
# invalid escape sequence.
df[df.variant.str.contains(r"\(")].head(10)
Out[6]:
In [7]:
# Add a column of empty strings to receive the text found in parentheses.
df['parenthetical_info']=''
In [8]:
# Pick one variant (index 88) as a sample value for trying out the regex.
# NOTE(review): .ix is not available in current pandas; .loc[88] is the
# label-based equivalent if the index is integer-labelled - confirm.
x = df.variant.ix[88]
In [9]:
# Display the sample variant.
x
Out[9]:
In [10]:
import re
In [14]:
# Print the text between the parentheses of the sample variant: the match is
# " (" + text + ")", so [2:-1] strips the leading " (" and the trailing ")".
# Raw string avoids the invalid "\(" escape of a plain literal.
print(re.search(r" \(.+\)", x).group(0)[2:-1])
In [15]:
# Print the sample variant with its " (...)" part removed.
# Raw string avoids the invalid "\(" escape of a plain literal.
print(re.sub(r" \(.+\)", '', x))
In [16]:
# For every variant containing " (...)": move the text between the parentheses
# into parenthetical_info and remove the " (...)" part from the variant.
paren_note = re.compile(r" \((.+)\)")
variant_col = df.columns.get_loc('variant')
info_col = df.columns.get_loc('parenthetical_info')
for i in range(len(df)):
    # Read and write the same row by POSITION through the frame itself, so
    # both columns are updated consistently whatever the index labels are,
    # and no assignment goes through a chained-indexing intermediate.
    var = df.iat[i, variant_col]
    found = paren_note.search(var)
    if found:
        df.iat[i, variant_col] = paren_note.sub('', var)
        # group(1) is the text inside the parentheses.
        df.iat[i, info_col] = found.group(1)
In [17]:
# Print the first 10 rows whose parenthetical_info is not the empty string.
print df[df.parenthetical_info != ''].head(10)
In [18]:
# Write the table back to the same file name it was loaded from.
df.to_pickle('df_variants_ict.pickle')
In [ ]: