In [1]:
from __future__ import print_function
import unicodecsv as csv
import json
import re
import requests
import time
In [2]:
# Download the full Steam app list; the list served by this endpoint gets updated every day.
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of trying to parse an
# error page as JSON.
response = requests.get("http://api.steampowered.com/ISteamApps/GetAppList/v0001/", timeout=30)
response.raise_for_status()
data = response.json()
# Each entry of `apps` is a dict that carries at least an 'appid' key (used by later cells).
apps = data['applist']['apps']['app']
len(apps)
Out[2]:
In [3]:
# Walk the app list until appid 460010 is met and remember how many entries were
# consumed (that entry included). The count is later used as a slice start, so
# scraping continues with the entry right after it — presumably 460010 is the last
# app already saved to the CSV (TODO confirm).
# If the appid is never found the count ends up equal to len(apps).
last_appid_read = 0
for position, app in enumerate(apps, 1):
    appid = app['appid']
    last_appid_read = position
    if int(appid) == 460010:
        break
last_appid_read
Out[3]:
In [4]:
# Spot-check a single entry by position; 24464 is a hard-coded index
# (presumably chosen near the resume position computed earlier — TODO confirm).
apps[24464]
Out[4]:
In [5]:
# Peek at the first entry to see what a single app record looks like.
apps[0]
Out[5]:
In [6]:
def clean_text(input_str):
    """Turn an HTML-flavoured description into lower-cased plain text.

    HTML tags and line breaks are replaced by a single space rather than being
    deleted, so words that were separated only by a newline or by a tag such as
    ``<br>`` are not fused into one token. Runs of whitespace are then collapsed
    and leading/trailing whitespace is removed.

    Note: a tag placed in the middle of a word (e.g. ``b<i>o</i>ld``) will
    split that word; this is accepted in exchange for not gluing together
    words around block-level tags.

    Args:
        input_str: raw text, possibly containing HTML markup and line breaks.

    Returns:
        The cleaned, lower-cased string ('' for an empty input).
    """
    # Raw strings avoid the invalid-escape warning that a plain '\s+' literal triggers.
    without_tags = re.sub(r'<[^>]+>', ' ', input_str)
    # \s+ also covers \r, \n and \r\n, so no separate newline pass is needed.
    collapsed = re.sub(r'\s+', ' ', without_tags).strip()
    # no need for more preprocessing because sklearn's tfidfvectorizer will clean it up prior to
    # featurizing
    return collapsed.lower()
In [7]:
# Scrape the store details of every app after the resume position and append one
# CSV row per successfully retrieved app.
base_url = "http://store.steampowered.com/api/appdetails?appids="
apps_remaining = apps[last_appid_read:]
fieldnames = ['appid','type','name','detailed_description','about_the_game','categories','genres']


def _request_app_payload(appid, timeout):
    """Perform one appdetails request and return the decoded JSON dict.

    Raises requests.RequestException on network/HTTP-level failures and
    ValueError when the body is not JSON or does not decode to a dict.
    """
    payload = requests.get(base_url + str(appid), timeout=timeout).json()
    if not isinstance(payload, dict):
        # A non-dict body (e.g. JSON null) would crash on the per-appid lookup later on.
        raise ValueError("unexpected payload for appid {0}".format(appid))
    return payload


def _fetch_app_details(appid, retry_wait=60, timeout=30):
    """Fetch the appdetails payload for `appid`, retrying once after `retry_wait` seconds.

    Returns the decoded dict, or None when both attempts fail.
    """
    try:
        return _request_app_payload(appid, timeout)
    except (requests.RequestException, ValueError):
        print("failed to retrieve appid {0}, let's wait a minute then try again".format(appid))
        # wait a bit then try again
        time.sleep(retry_wait)
    try:
        return _request_app_payload(appid, timeout)
    except (requests.RequestException, ValueError):
        # but if it fails again give up
        print("failed to retrieve appid {0}, skipping".format(appid))
        return None


def _cleaned_text(app_data, key):
    """Return clean_text(app_data[key]), or '' when the key is absent."""
    try:
        return clean_text(app_data[key])
    except KeyError:
        return ''


def _joined_descriptions(app_data, key):
    """Comma-join the 'description' of every item under `key`, or 'Uncategorized' on a missing key."""
    try:
        return ",".join(item['description'] for item in app_data[key])
    except KeyError:
        return 'Uncategorized'


# unicodecsv encodes rows itself and writes bytes, so the file must be opened in binary mode.
with open('../../data/steam/data.csv','ab') as csvfile:
    writer = csv.DictWriter(csvfile,fieldnames=fieldnames)
    # The header is not written here because the file is opened for appending
    # (presumably it was written on the very first run — TODO confirm).
    # writer.writeheader()
    for app in apps_remaining:
        appid = app['appid']
        data = _fetch_app_details(appid)
        if data is None:
            continue
        entry = data.get(str(appid)) or {}
        success = entry.get('success')
        if not (success == "True" or success == True):
            print("no success for appid {0}, got success={1}".format(appid,success))
            time.sleep(2)
            continue
        app_data = entry['data']
        data_dict = {
            'appid': appid,
            'name': app_data['name'],
            'type': app_data['type'],
            'detailed_description': _cleaned_text(app_data, 'detailed_description'),
            'about_the_game': _cleaned_text(app_data, 'about_the_game'),
            'categories': _joined_descriptions(app_data, 'categories'),
            'genres': _joined_descriptions(app_data, 'genres'),
        }
        writer.writerow(data_dict)
        # Pause between requests to go easy on the store API.
        time.sleep(3)
In [ ]:
In [ ]: