In [ ]:
# TEST!
# List apps that have a recorded player count but were not scraped yet,
# ranked by player count (descending).
from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
tree = etree.parse(BytesIO(xml))

test = {}
for app in tree.iter('app'):
    # Some apps have no <number_of_players> child or an empty one; skip those
    # explicitly instead of swallowing every exception with a bare except.
    players = app.xpath('.//number_of_players')
    if not players or players[0].text is None:
        continue
    if app.get('scraped') == '1':
        continue
    try:
        test[app.get('id')] = int(players[0].text)
    except ValueError:
        pass  # non-numeric player count: same best-effort skip as before
sorted(test.items(), key=lambda x: x[1], reverse=True)
In [ ]:
# tags test. i'm worried that there will be many cases of almost identical tags.
# but apparently that's not the case
# cartoon - cartoony
from collections import Counter

from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
tree = etree.parse(BytesIO(xml))

# Counter replaces the manual "if key in list(tags.keys())" bookkeeping,
# which materialized the key list (O(n) membership test) on every tag.
tags = Counter(tag.text for tag in tree.iter('tag'))
# Sort by the (tag, count) tuple, exactly as the original lambda did.
sorted(tags.items(), key=lambda x: x)
In [ ]:
# initialize from http://api.steampowered.com/ISteamApps/GetAppList/v0001/
# WARNING: THIS WILL WIPE ALL DATA
import sys

import requests
from lxml import etree

request = 'http://api.steampowered.com/ISteamApps/GetAppList/v0001/'
response = requests.get(request)
# Fail loudly on HTTP errors (4xx/5xx) instead of trying to parse an error page.
response.raise_for_status()
if len(response.text) < 1000:
    sys.exit("something wrong with request")
# response.json() parses the body directly; no need for json.loads(response.text).
json_response = response.json()

# Build a fresh <root><app id=..><name>..</name></app>...</root> tree.
root = etree.Element("root")
for r_app in json_response['applist']['apps']['app']:
    app = etree.SubElement(root, 'app')
    app.set('id', str(r_app['appid']))
    name = etree.SubElement(app, 'name')
    name.text = str(r_app['name'])
tree = etree.ElementTree(root)
with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))
In [ ]:
# append data from _appids_scrapped files:
# it's a file of api responses
# requests = appids found in users libraries
import json
import datetime
from lxml import etree
from io import BytesIO


def _find_app(tree, appid):
    """Return the <app> element whose id attribute equals appid, or None."""
    for xml_app in tree.iter('app'):
        if xml_app.get('id') == appid:
            return xml_app
    return None


def _get_or_create(parent, tag):
    """Return the first descendant <tag> of parent, creating it when absent."""
    query = parent.xpath(".//" + tag)
    if len(query) == 0:
        return etree.SubElement(parent, tag)
    return query[0]


def _replace_list(parent, container_tag, item_tag, values):
    """Drop any existing <container_tag> and rebuild it with title-cased items."""
    query = parent.xpath(".//" + container_tag)
    if len(query) != 0:
        parent.remove(query[0])
    sub = etree.SubElement(parent, container_tag)
    for value in values:
        item = etree.SubElement(sub, item_tag)
        item.text = value.title()


def _parse_release_date(dates):
    """Reformat the scraped date list to YYYY-MM-DD; None when absent/unparseable."""
    try:
        date = dates[0]
    except IndexError:  # sometimes there's no date
        return None
    # let's reformat date to YYYY-MM-DD format; the site uses two formats
    for fmt in ("%d %b, %Y", "%b %Y"):
        try:
            return datetime.datetime.strptime(date, fmt).strftime("%Y-%m-%d")
        except ValueError:  # wrong date format, try the next one
            continue
    return None


with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
tree = etree.parse(BytesIO(xml))
with open("C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_appids_scraped.json", 'r') as f:
    json_data = json.loads(f.read())

for app in json_data:
    requested_app = _find_app(tree, app['appid'])
    if requested_app is None:
        print(app, 'not in xml')
        continue
    requested_app.set('scraped', '1')
    for key in list(app.keys()):
        if key == 'requested_appid':  # id as it was requested by scraper, not as was given in response
            if app['requested_appid'] != app['appid']:
                sub = _get_or_create(requested_app, 'redirected_from')
                red_id = etree.SubElement(sub, 'redirected_from')
                red_id.text = app['requested_appid']
                # then flag the original:
                app_to_flag = _find_app(tree, app['requested_appid'])
                if app_to_flag is None:
                    print(app['requested_appid'], 'not in xml')
                    continue
                # BUGFIX: point at the app we were redirected TO (appid); the
                # original stored the flagged app's own id (requested_appid),
                # which made the attribute useless.
                app_to_flag.set('redirects_to_id', app['appid'])
        elif key == 'title':  # title as scraped
            # name from json?
            if requested_app.xpath(".//name")[0].text != app['title']:
                sub = _get_or_create(requested_app, 'name_scraped')
                sub.text = str(app['title'])
        elif key == 'release_date':  # it's a list
            sub = _get_or_create(requested_app, 'release_date')
            sub.text = _parse_release_date(app['release_date'])
        elif key == 'price':  # price in EUR, it's a list
            sub = _get_or_create(requested_app, 'price')
            try:
                price = app['price'][0]
            except IndexError:  # sometimes there's no price
                price = None
            if price is not None:
                # BUGFIX: normalize the decimal separator BEFORE storing, so the
                # stored price never keeps the scraped comma form.
                price = price.replace(',', '.')
                if price != '0.00':
                    sub.set('currency', app['price_currency'][0])
            sub.text = price
        elif key == 'developer':  # may contain several developers; same for publishers and tags
            _replace_list(requested_app, 'developers', 'developer', app['developer'])
        elif key == 'publisher':
            _replace_list(requested_app, 'publishers', 'publisher', app['publisher'])
        elif key == 'tags':
            _replace_list(requested_app, 'tags', 'tag', app['tags'])

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))
In [ ]:
# append data from scraped_meta files:
# these are meta files generated when scraping users
# they contain values regarding number of users playing specific game etc
import json
from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
tree = etree.parse(BytesIO(xml))
with open("C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\profiles0_scraped_meta.txt", 'r') as f:
    json_data = json.loads(f.read())

for appid, meta in json_data.items():
    # does this app exist in xml?
    # (an xpath query like tree.xpath("//app[@id=...]") is apparently slower)
    found = None
    for candidate in tree.iter('app'):
        if candidate.get('id') == appid:
            found = candidate
            break
    if found is None:
        print(appid, 'not in xml')
        continue
    for key in list(meta.keys()):
        if key == 'number':  # number of players: accumulate across meta files
            hits = found.xpath(".//number_of_players")
            if hits:
                hits[0].text = str(int(hits[0].text) + meta['number'])
            else:
                node = etree.SubElement(found, 'number_of_players')
                node.text = str(meta['number'])
        elif key == 'user':  # last user from which the data was collected
            hits = found.xpath(".//found_in_user")
            node = hits[0] if hits else etree.SubElement(found, 'found_in_user')
            node.text = str(meta['user'])
        elif key == 'total_playtime':  # total playtime of all users: accumulate
            hits = found.xpath(".//total_playtime")
            if hits:
                hits[0].text = str(int(hits[0].text) + meta['total_playtime'])
            else:
                node = etree.SubElement(found, 'total_playtime')
                node.text = str(meta['total_playtime'])

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))
In [ ]:
# color from PCA to color generation
from lxml import etree
from io import BytesIO
from colormath.color_objects import sRGBColor  # https://anaconda.org/melund/colormath

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
tree = etree.parse(BytesIO(xml))
with open("C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_colors_from_pca.csv", 'r') as f:
    for line in f.readlines():
        if len(line) < 10:
            continue
        line = line.strip("\n")  # appid10170,0.0,0.0196078431372549,0.4
        split = line.split(',')
        if len(split) != 4:
            print('line seems errorenous:', line)
            continue
        app = split[0]
        # BUGFIX: remove the literal "appid" prefix by slicing.
        # str.strip('appid') strips by CHARACTER SET from both ends, which is
        # not a prefix removal and is fragile.
        if app.startswith('appid'):
            app = app[len('appid'):]
        # BUGFIX: sRGBColor expects numeric components in [0, 1]; the CSV
        # fields are strings, which break the hex upscaling arithmetic.
        color_from_pca = sRGBColor(float(split[1]), float(split[2]), float(split[3])).get_rgb_hex()
        # find app
        requested_app = None
        for xml_app in tree.iter('app'):
            if xml_app.get('id') == app:
                requested_app = xml_app
                break
        if requested_app is None:
            print(app, 'not in xml')
            continue
        # get-or-create <colors>/<color type="pca"> and store the hex value
        query = requested_app.xpath(".//colors")
        if len(query) == 0:
            sub = etree.SubElement(requested_app, 'colors')
        else:
            sub = query[0]
        query = sub.xpath(".//color[@type='pca']")
        if len(query) == 0:
            color = etree.SubElement(sub, 'color')
            color.set('type', 'pca')
        else:
            color = query[0]
        color.text = color_from_pca

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))
In [ ]:
# create a 'standardized size' field which takes in number of players and distributes it between 0 and 1
# in such a way that they're equally distributed
from lxml import etree
from io import BytesIO
import math

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
tree = etree.parse(BytesIO(xml))

# Collect log(player count) over all apps to find the observed range.
# Builtin min/max suffice here; numpy was unnecessary for a plain list.
numbers = [math.log(int(sub.text)) for sub in tree.iter('number_of_players')]
if not numbers:
    raise SystemExit('no number_of_players elements found')
amin = min(numbers)
amax = max(numbers)
span = amax - amin

for app in tree.iter('app'):
    query = app.xpath(".//number_of_players")
    if len(query) == 0:
        continue
    sub = query[0]
    standardized_size = math.log(int(sub.text))
    # Guard against a degenerate range (every app with the same count),
    # which would otherwise divide by zero.
    standardized_size = (standardized_size - amin) / span if span else 0.0
    print(sub.text, standardized_size)
    query = app.xpath(".//node_sizes")
    if len(query) == 0:
        sizes = etree.SubElement(app, 'node_sizes')
    else:
        sizes = query[0]
    query = app.xpath(".//size[@type='std_number_of_players']")
    if len(query) == 0:
        entry = etree.SubElement(sizes, 'size')
        # set the type attribute only on creation; a re-found node has it already
        entry.set('type', 'std_number_of_players')
    else:
        entry = query[0]
    entry.text = str(standardized_size)

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))
In [ ]:
# dump all ids into one file (to have something to scrap with appdata_from_appids_spider.py)
from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
tree = etree.parse(BytesIO(xml))
# one appid per line, in document order
with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_appids_from_xml.txt', 'w') as out:
    out.writelines(app.get('id') + '\n' for app in tree.iter('app'))
In [8]:
# get position from tsne
from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
tree = etree.parse(BytesIO(xml))

# first pass: count lines so progress can be reported as a percentage
with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_tsne3dim.csv', 'r') as f:
    f_len = sum(1 for _ in f)
print('items:', f_len)

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_tsne3dim.csv', 'r') as f:
    # BUGFIX: the original incremented i twice per processed line (and not at
    # all for short lines), so the progress readout could run past 100%.
    # enumerate counts each line exactly once.
    for i, line in enumerate(f):
        if (i + 1) % 100 == 0:
            print("progress: %.2f" % (i * 100 / f_len) + "%")
        if len(line) < 10:
            continue
        line = line.strip("\n")  # appid10,43.68485200657752,17.50947333605341,12.692957234610187
        split = line.split(',')
        if len(split) != 4:
            print('line seems errorenous:', line)
            continue
        appid = split[0]
        # BUGFIX: slice off the literal "appid" prefix; str.strip('appid')
        # strips by character set, not by prefix.
        if appid.startswith('appid'):
            appid = appid[len('appid'):]
        hits = tree.xpath('//app[@id="' + str(appid) + '"]')
        if not hits:
            # tolerate missing apps like the other cells instead of IndexError
            print(appid, 'not in xml')
            continue
        app = hits[0]
        query = app.xpath(".//positions")
        if len(query) == 0:
            sub = etree.SubElement(app, 'positions')
        else:
            sub = query[0]
        query = sub.xpath(".//position[@type='tsne']")
        if len(query) == 0:
            sub2 = etree.SubElement(sub, 'position')
            sub2.set('type', 'tsne')
        else:
            sub2 = query[0]
        sub2.set('x', split[1])
        sub2.set('y', split[2])
        sub2.set('z', split[3])

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))
In [ ]: