In [1]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import os
import matplotlib.pyplot as plt
import pandas as pd
import sys
import matplotlib
import numpy as np
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from matplotlib import rcParams
# 'font.family' picks the generic family used for figure text; the list of
# concrete fonts tried for the 'monospace' family lives in 'font.monospace'.
# The previous version filled 'font.sans-serif' instead, which is not
# consulted while the family is 'monospace', so Lucida Console was never
# actually requested.
rcParams['font.family'] = 'monospace'
# DejaVu Sans Mono ships with matplotlib and serves as the fallback on
# machines where Lucida Console is not installed.
rcParams['font.monospace'] = ['Lucida Console', 'DejaVu Sans Mono']
%matplotlib inline
# Report the interpreter and library versions this notebook ran with.
for _label, _version in (
    ('Python', sys.version),
    ('Pandas', pd.__version__),
    ('Matplotlib', matplotlib.__version__),
):
    print('{} version {}'.format(_label, _version))
In [2]:
# Make sure the folder that receives the CSV files exists.
# exist_ok=True turns this into a single call that is safe to re-run and
# leaves no gap between checking for the directory and creating it.
os.makedirs('./data', exist_ok=True)
In [5]:
# Scrape the DLC names and prices listed on the Steam store page for app
# 221680 and save them, one (name, price) row per entry, to ./data/dlc.csv.

# Download and parse the page BEFORE opening the output file, so that a
# network failure cannot leave a truncated or empty dlc.csv behind. The
# `with` block closes the HTTP response once BeautifulSoup has consumed it.
with urlopen('http://store.steampowered.com/app/221680/') as html:
    bsObj = BeautifulSoup(html, 'html.parser')

# find_all is the current bs4 name; findAll is only kept as a legacy alias.
nameList = bsObj.find_all('div', {'class': 'game_area_dlc_name'})
priceList = bsObj.find_all('div', {'class': 'game_area_dlc_price'})

# newline='' is what the csv module asks for on files it writes to (without
# it, extra blank rows can appear on Windows). The explicit utf-8 encoding
# matches pandas' default when the file is read back with read_csv.
with open('./data/dlc.csv', 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    # zip() pairs each name with the price at the same position and stops at
    # the end of the shorter list.
    for name, price in zip(nameList, priceList):
        csvwriter.writerow([name.get_text().strip(), price.get_text().strip()])
In [7]:
# Load the scraped rows. header=None treats the first line as data, so the
# columns receive the integer labels 0, 1, ...
dlc_csv_path = './data/dlc.csv'
df = pd.read_csv(dlc_csv_path, header=None)
In [8]:
# Display the (rows, columns) size of the table that was just loaded.
df.shape
Out[8]:
In [9]:
# Boolean mask marking the rows whose name (column 0) contains the literal
# text 'Song Pack'.
#   regex=False - match the text literally instead of as a regular expression
#   na=False    - a missing name counts as "not a pack"; without it the mask
#                 would hold NaN for that row and could not be used to index df
pack = df[0].str.contains('Song Pack', regex=False, na=False)
In [10]:
# Save only the song-pack rows. The original row number is still written as
# the first CSV column, but header=False stops pandas from also emitting the
# column labels (0, 1) as a first line. This file is read back with
# header=None, which would otherwise turn that label line into a bogus data
# row whose "price" is 1.
df[pack].to_csv('./data/pack.csv', encoding='utf-8', header=False)
In [11]:
# Convert every column after the name column from text such as '$1,234.56'
# to float64 by removing the currency sign and thousands separators.
# regex=False is spelled out because '$' is a regular-expression anchor and
# the default value of `regex` differs between pandas versions; being
# explicit guarantees the dollar sign is removed as a literal character.
# NOTE: this cell is not re-runnable on its own - once the columns are
# numeric the .str accessor no longer applies.
price_columns = df.columns[1:]
df[price_columns] = (
    df[price_columns]
    .apply(
        lambda col: col.str.replace('$', '', regex=False)
                       .str.replace(',', '', regex=False)
    )
    .astype(np.float64)
)
In [12]:
# Give the two columns readable labels in place of the integer defaults.
dlc_column_labels = ['Song / Song Pack', 'Price']
df.columns = dlc_column_labels
In [13]:
# Preview the first ten rows after cleaning and relabelling.
df.head(10)
Out[13]:
In [14]:
# Sum of every listed price, rounded to cents by formatting with two
# decimals and parsing the text back into a float.
total = float(format(df['Price'].sum(), '.2f'))
print(total)
In [15]:
# Read the saved song-pack rows back in. header=None treats every line as
# data and labels the columns 0, 1, 2.
pack_csv_path = './data/pack.csv'
df2 = pd.read_csv(pack_csv_path, header=None)
In [16]:
# Convert the price column(s) - everything after the row-number and name
# columns - from text such as '$1,234.56' to float64.
# regex=False is spelled out because '$' is a regular-expression anchor and
# the default value of `regex` differs between pandas versions; being
# explicit guarantees the dollar sign is removed as a literal character.
# NOTE: this cell is not re-runnable on its own - once the columns are
# numeric the .str accessor no longer applies.
pack_price_columns = df2.columns[2:]
df2[pack_price_columns] = (
    df2[pack_price_columns]
    .apply(
        lambda col: col.str.replace('$', '', regex=False)
                       .str.replace(',', '', regex=False)
    )
    .astype(np.float64)
)
In [17]:
# Display the (rows, columns) size of the song-pack table.
df2.shape
Out[17]:
In [18]:
# Label the three columns: saved row number, pack name, price.
pack_column_labels = ['List #', 'Song Pack', 'Price']
df2.columns = pack_column_labels
In [19]:
# Preview the first ten song-pack rows after relabelling.
df2.head(10)
Out[19]:
In [20]:
# pack.csv may begin with the column-label line that DataFrame.to_csv writes
# by default. Read back with header=None, that line becomes a row with an
# empty 'List #' and a price of exactly 1.0. Remove it by its missing
# 'List #' rather than with `Price != 1.00`, which would also discard any
# real pack that happens to cost $1.00.
df2 = df2[df2['List #'].notna()]
# The saved row number is not needed for the analysis. `columns=` replaces
# the positional axis argument (`drop('List #', 1)`), which pandas 2.0
# no longer accepts.
df2 = df2.drop(columns='List #')
In [21]:
# Preview the first ten song-pack rows after filtering and dropping 'List #'.
df2.head(10)
Out[21]:
In [22]:
# Combined price of all song packs.
packs = df2.loc[:, 'Price'].sum()
print('Packs:', packs)
In [23]:
# Whatever part of the overall total is not accounted for by the song packs.
songs = total - packs
print('Songs:', songs)
In [24]:
# One-row summary comparing the songs total with the packs total.
# The source mapping is deliberately NOT named `dict`: that name shadowed the
# builtin for every later cell in the kernel session.
summary_data = {'Songs': [songs], 'Packs': [packs]}
# `columns=` fixes the column order at construction time.
df3 = pd.DataFrame(data=summary_data, columns=['Songs', 'Packs'])
df3['Difference'] = df3['Songs'] - df3['Packs']
df3.head(1)
Out[24]:
In [25]:
# Price distributions: a histogram and a kernel-density estimate for the full
# DLC table (df) and for the song-pack table (df2).
df['Price'] = df['Price'].astype(np.float64)

# Options common to all four figures.
shared_plot_options = {'alpha': 0.5, 'grid': True, 'xlim': (0, 40)}

df.plot.hist(title='Song Price Histogram', **shared_plot_options)
df.plot.kde(title='Song Price KDE', **shared_plot_options)
df2.plot.hist(title='Pack Price Histogram', **shared_plot_options)
df2.plot.kde(title='Pack Price KDE', **shared_plot_options)
Out[25]:
In [ ]: