In [69]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import requests
import BeautifulSoup as soup
import os
import time
In [2]:
# Base URL of the chessgames.com player directory; the download loop appends '<LETTER>.html' to it.
url = 'http://www.chessgames.com/directory/'
# NOTE(review): this value is reassigned inside the download loop before it is ever read.
fname = 'chess.htm'
In [96]:
# Download (or load from the local cache) one directory page per capital letter
# and collect the page sources in raw_pages.
start = 65  # ord('A')
raw_pages = []
for i in range(26):
    letter = chr(start + i)
    page = url + letter + '.html'
    fname = letter + '.html'
    if not os.path.isfile(fname):
        # Not cached yet: fetch the page once and keep a local copy so that
        # re-running the cell does not hit the server again.
        r = requests.get(page)
        time.sleep(1)  # pause between requests
        print('%s %s' % (letter, r))
        try:
            # The payload is already ASCII-encoded bytes, so write it in binary
            # mode; the with-block guarantees the file is flushed and closed
            # before it is reopened for reading below.
            with open(fname, 'wb') as f:
                f.write(r.text.encode('ascii', 'ignore'))
        finally:
            r.close()
    # Always read back from disk so cached and freshly downloaded pages take
    # the same path.
    with open(fname, 'r') as f:
        lines = f.readlines()
    raw_pages.append(' '.join(lines))
In [97]:
# Turn every page in raw_pages into its own DataFrame and collect them in dfs.
dfs = []
for html in raw_pages:
    tree = soup.BeautifulSoup(html)
    # Only table rows with exactly five <td> cells are kept.
    player_rows = [tr for tr in tree.findAll('tr') if len(tr.findAll('td')) == 5]

    ratings, names, yearss, gamess = [], [], [], []
    for tr in player_rows:
        tds = tr.findAll('td')
        # Cells 0, 2, 3 and 4 are read as rating, name, years and games; cell 1 is unused.
        # NOTE(review): replace(' ', '') removes every space, including any
        # inside names - confirm that is intended.
        rating_text, name, years, games = [
            tds[k].text.replace(' ', '') for k in (0, 2, 3, 4)
        ]
        try:
            rating = int(rating_text)
        except ValueError:
            # Non-numeric rating cell: store -1 as a sentinel.
            rating = -1
        ratings.append(rating)
        names.append(name)
        yearss.append(years)
        gamess.append(games)

    dfs.append(pd.DataFrame({'rating': ratings, 'name': names, 'years': yearss, 'games': gamess}))
In [100]:
# Stack all per-page frames into one table.
# The previous loop started from dfs[0] and then appended dfs[2:], silently
# dropping dfs[1]; concatenating the whole list includes every page and avoids
# repeated copying. Row labels are kept as they were in each per-page frame.
df = pd.concat(dfs)
df.shape
Out[100]:
In [101]:
# Order the table by rating (ascending), in place.
# DataFrame.sort was replaced by sort_values (and later removed from pandas),
# so use the newer method when it exists and fall back to the old one otherwise.
if hasattr(df, 'sort_values'):
    df.sort_values('rating', inplace=True)
else:
    df.sort('rating', inplace=True)
df.head()
Out[101]:
In [102]:
# Rating curve over the rows of df, in their current row order.
plt.figure(figsize=(15,8))
# Plot the raw values by position: df still carries the row labels of the
# individual per-page frames, and matplotlib would otherwise use those labels
# as x coordinates, scrambling the curve.
plt.plot(df['rating'].values)
plt.xlabel('player (row order)')
plt.ylabel('rating')
# The y-limits also hide the -1 sentinel used for unparseable ratings.
plt.ylim([1500, 3000])
plt.show()
In [103]:
def _parse_count(value):
    """Convert a game count such as '1,234' to an int.

    Going through str() means values that are already integers are accepted
    too, so re-running this cell no longer fails with AttributeError.
    """
    return int(str(value).replace(',', ''))


# Strip thousands separators and store the game counts as integers.
df['games'] = df['games'].apply(_parse_count)
In [104]:
# Scatter of rating against number of games, zoomed to 0-500 games and
# ratings between 1500 and 3000.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df['games'], df['rating'], alpha=0.2)
ax.set_xlabel('game')
ax.set_ylabel('rating')
ax.set_ylim([1500, 3000])
ax.set_xlim([0, 500])
plt.show()
In [105]:
# Write the table to disk without the row labels; note the file is
# tab-separated even though its name ends in .csv.
export_options = {'sep': '\t', 'index': False}
df.to_csv('chessdata_full.csv', **export_options)
In [106]:
# Text before the first '-' of each 'years' value (the whole value when there is no '-').
df['years_low'] = df['years'].map(lambda span: span.partition('-')[0])
def cleanit(x):
    """Return the part of ``x`` that follows the first '-'.

    If ``x`` contains no '-', it is returned unchanged. If it contains more
    than one '-', only the piece between the first and second dash is
    returned.
    """
    _, dash, rest = x.partition('-')
    if not dash:
        # No separator at all: the value is a single item.
        return x
    return rest.partition('-')[0]
# Second half of each 'years' value (or the whole value when it has no '-').
df['years_high'] = df['years'].map(cleanit)
In [107]:
# Difference between the two year columns, both converted to int first.
first_year = df['years_low'].astype(int)
last_year = df['years_high'].astype(int)
df['active'] = last_year - first_year
In [109]:
# Scatter of rating against the 'active' column, zoomed to 0-100 on x and
# ratings between 1500 and 3000.
fig, ax = plt.subplots(figsize=(10, 10))
ax.scatter(df['active'], df['rating'], alpha=0.2)
ax.set_xlabel('active')
ax.set_ylabel('rating')
ax.set_xlim([0, 100])
ax.set_ylim([1500, 3000])
plt.show()
In [110]:
# Shell escape: list the current working directory, where the cached pages and the CSV export are written.
!ls