xml_utils



In [ ]:
# TEST!
# List (appid -> player count) for apps that have a count but were not scraped yet.

from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
    tree = etree.parse(BytesIO(xml))

test = {}
for app in tree.iter('app'):
    # Query once and guard explicitly; the old bare `except: pass` silently
    # swallowed IndexError (missing element) AND any other error (e.g. a
    # non-numeric count), which could hide real data problems.
    players = app.xpath('.//number_of_players')
    if not players or players[0].text is None:
        continue
    if app.get('scraped') != '1':
        try:
            test[app.get('id')] = int(players[0].text)
        except ValueError:
            pass  # non-numeric player count; skip this app

sorted(test.items(), key=lambda x: x[1], reverse=True)

In [ ]:
# tags test. i'm worried that there will be many cases of almost identical tags.
# but apparently that's not the case
# cartoon - cartoony

from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
    tree = etree.parse(BytesIO(xml))

# Count tag occurrences. The old `tag.text in list(tags.keys())` built a
# fresh list on every iteration (O(n) membership test per tag); dict.get
# does the same in O(1) without the copy.
tags = {}
for tag in tree.iter('tag'):
    tags[tag.text] = tags.get(tag.text, 0) + 1

# sorted by (tag name, count) tuple, same as the identity key
sorted(tags.items(), key=lambda x: x)

In [ ]:
# initialize from http://api.steampowered.com/ISteamApps/GetAppList/v0001/
# WARNING: THIS WILL WIPE ALL DATA
import requests
import sys
from lxml import etree

request = 'http://api.steampowered.com/ISteamApps/GetAppList/v0001/'
response = requests.get(request)
# Fail loudly on HTTP errors (4xx/5xx) instead of relying only on the
# body-length heuristic below.
response.raise_for_status()
if len(response.text) < 1000:
    sys.exit("something wrong with request")
# requests decodes JSON itself; no need for json.loads(response.text)
json_response = response.json()

# Build <root><app id=...><name>...</name></app>...</root>
root = etree.Element("root")
for r_app in json_response['applist']['apps']['app']:
    app = etree.SubElement(root, 'app')
    app.set('id', str(r_app['appid']))
    name = etree.SubElement(app, 'name')
    name.text = str(r_app['name'])
tree = etree.ElementTree(root)
with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))

In [ ]:
# append data from _appids_scraped files:
# it's a file of api responses
# requests = appids found in users libraries

import json
import datetime
from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
    tree = etree.parse(BytesIO(xml))

with open("C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_appids_scraped.json", 'r') as f:
    json_data = json.loads(f.read())

# Index the tree by app id once. The previous code re-scanned every <app>
# for every JSON record (O(records * apps)); this is one O(apps) pass.
apps_by_id = {xml_app.get('id'): xml_app for xml_app in tree.iter('app')}


def _get_or_create(parent, tag):
    """Return the first descendant <tag> of parent, creating one if absent."""
    query = parent.xpath(".//" + tag)
    return query[0] if query else etree.SubElement(parent, tag)


def _rebuild_list(parent, container_tag, child_tag, values):
    """Drop any existing <container_tag> and rebuild it with title-cased
    <child_tag> children (shared by developers / publishers / tags)."""
    query = parent.xpath(".//" + container_tag)
    if query:
        parent.remove(query[0])
    sub = etree.SubElement(parent, container_tag)
    for value in values:
        child = etree.SubElement(sub, child_tag)
        child.text = value.title()


def _format_date(dates):
    """Normalize the scraped release date (a list) to YYYY-MM-DD.

    Returns None when the list is empty or the format is unrecognized.
    """
    try:
        date = dates[0]
    except IndexError:  # sometimes there's no date
        return None
    for fmt in ("%d %b, %Y", "%b %Y"):
        try:
            return datetime.datetime.strptime(date, fmt).strftime("%Y-%m-%d")
        except ValueError:
            continue
    return None  # wrong date format


for app in json_data:
    requested_app = apps_by_id.get(app['appid'])
    if requested_app is None:
        print(app, 'not in xml')
        continue

    requested_app.set('scraped', '1')

    for key in app:
        if key == 'requested_appid':  # id as it was requested by scraper, not as was given in response
            if app['requested_appid'] != app['appid']:
                sub = _get_or_create(requested_app, 'redirected_from')
                red_id = etree.SubElement(sub, 'redirected_from')
                red_id.text = app['requested_appid']

                # then flag the original:
                app_to_flag = apps_by_id.get(app['requested_appid'])
                if app_to_flag is None:
                    print(app['requested_appid'], 'not in xml')
                    continue
                # BUGFIX: this used to store app['requested_appid'] -- the
                # flagged app's own id -- instead of the id it actually
                # redirects to (the one returned in the response).
                app_to_flag.set('redirects_to_id', app['appid'])

        elif key == 'title':  # title as scraped
            # keep the scraped name only when it differs from the API name
            if requested_app.xpath(".//name")[0].text != app['title']:
                sub = _get_or_create(requested_app, 'name_scraped')
                sub.text = str(app['title'])

        elif key == 'release_date':  # it's a list
            sub = _get_or_create(requested_app, 'release_date')
            sub.text = _format_date(app['release_date'])

        elif key == 'price':  # price in EUR, it's a list
            sub = _get_or_create(requested_app, 'price')
            try:
                price = app['price'][0]
            except IndexError:  # sometimes there's no price
                price = None
            # stored text keeps the scraped formatting (decimal comma);
            # the comma->dot conversion is only for the free-game check
            sub.text = price
            if price is not None:
                if price.replace(',', '.') != '0.00':
                    sub.set('currency', app['price_currency'][0])

        elif key == 'developer':  # may contain several developers, it's the same for publishers and tags
            _rebuild_list(requested_app, 'developers', 'developer', app['developer'])

        elif key == 'publisher':
            _rebuild_list(requested_app, 'publishers', 'publisher', app['publisher'])

        elif key == 'tags':
            _rebuild_list(requested_app, 'tags', 'tag', app['tags'])

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))

In [ ]:
# append data from scraped_meta files:
# these are meta files generated when scraping users
# they contain values regarding number of users playing specific game etc

import json
from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
    tree = etree.parse(BytesIO(xml))

with open("C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\profiles0_scraped_meta.txt", 'r') as f:
    json_data = json.loads(f.read())

# Build the id -> element map once; the previous per-app tree.iter scan was
# O(records * apps). (An xpath per app was reportedly even slower.)
apps_by_id = {xml_app.get('id'): xml_app for xml_app in tree.iter('app')}

for app in json_data:
    # does this app exist in xml?
    requested_app = apps_by_id.get(app)
    if requested_app is None:
        print(app, 'not in xml')
        continue

    meta = json_data[app]
    for key in meta:
        if key == 'number':  # number of players; accumulate across meta files
            query = requested_app.xpath(".//number_of_players")
            if len(query) == 0:
                sub = etree.SubElement(requested_app, 'number_of_players')
                sub.text = str(meta['number'])
            else:
                sub = query[0]
                sub.text = str(int(sub.text) + meta['number'])
        elif key == 'user':  # last user from which the data was collected
            query = requested_app.xpath(".//found_in_user")
            sub = query[0] if query else etree.SubElement(requested_app, 'found_in_user')
            sub.text = str(meta['user'])
        elif key == 'total_playtime':  # total playtime of all users; accumulated
            query = requested_app.xpath(".//total_playtime")
            if len(query) == 0:
                sub = etree.SubElement(requested_app, 'total_playtime')
                sub.text = str(meta['total_playtime'])
            else:
                sub = query[0]
                sub.text = str(int(sub.text) + meta['total_playtime'])

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))

In [ ]:
# color from PCA to color generation

from lxml import etree
from io import BytesIO
from colormath.color_objects import sRGBColor # https://anaconda.org/melund/colormath

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
    tree = etree.parse(BytesIO(xml))

# id -> element, built once instead of scanning the tree per CSV line
apps_by_id = {xml_app.get('id'): xml_app for xml_app in tree.iter('app')}

with open("C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_colors_from_pca.csv", 'r') as f:
    for line in f.readlines():
        if len(line)<10:
            continue
        line = line.strip("\n") # appid10170,0.0,0.0196078431372549,0.4
        split = line.split(',')
        if len(split)!=4:
            print('line seems errorenous:',line)
            continue
        app = split[0]
        # BUGFIX: str.strip('appid') strips any of the characters a/p/i/d
        # from BOTH ends; it happens to work for numeric ids but is wrong
        # in general. Remove the literal prefix instead.
        if app.startswith('appid'):
            app = app[len('appid'):]
        # convert the CSV fields to floats explicitly rather than relying
        # on colormath to coerce the strings
        color_from_pca = sRGBColor(float(split[1]), float(split[2]), float(split[3])).get_rgb_hex()

        # find app
        requested_app = apps_by_id.get(app)
        if requested_app is None:
            print(app, 'not in xml')
            continue

        # get-or-create <colors>/<color type="pca"> and store the hex value
        query = requested_app.xpath(".//colors")
        sub = query[0] if query else etree.SubElement(requested_app, 'colors')
        query = sub.xpath(".//color[@type='pca']")
        if len(query) == 0:
            color = etree.SubElement(sub, 'color')
            color.set('type', 'pca')
        else:
            color = query[0]
        color.text = color_from_pca

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))

In [ ]:
# create a 'standardized size' field which takes in number of players and distributes it between 0 and 1
# in such a way that they're equally distributed

from lxml import etree
from io import BytesIO
import numpy as np
import math

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
    tree = etree.parse(BytesIO(xml))

# log-scale the counts so min-max normalization spreads them more evenly
numbers = [math.log(int(sub.text)) for sub in tree.iter('number_of_players')]
amin = np.amin(numbers)
amax = np.amax(numbers)
span = amax - amin

for app in tree.iter('app'):
    query = app.xpath(".//number_of_players")
    if len(query) == 0:
        continue
    sub = query[0]
    standardized_size = math.log(int(sub.text))
    # BUGFIX: guard against span == 0 (all apps share one count), which
    # previously raised ZeroDivisionError
    standardized_size = (standardized_size - amin) / span if span else 0.0
    print(sub.text, standardized_size)

    query = app.xpath(".//node_sizes")
    sub = query[0] if query else etree.SubElement(app, 'node_sizes')
    # scope the lookup to <node_sizes>; the old query searched the whole
    # <app> and could match an unrelated <size> element
    query = sub.xpath(".//size[@type='std_number_of_players']")
    sub2 = query[0] if query else etree.SubElement(sub, 'size')
    sub2.set('type', 'std_number_of_players')
    sub2.text = str(standardized_size)

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))

In [ ]:
# dump all ids into one file (to have something to scrape with appdata_from_appids_spider.py)
from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    tree = etree.parse(BytesIO(f.read()))

# one id per line, in document order
with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_appids_from_xml.txt', 'w') as f:
    f.writelines(app.get('id') + '\n' for app in tree.iter('app'))

In [8]:
# get position from tsne
from lxml import etree
from io import BytesIO

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'rb') as f:
    xml = f.read()
    tree = etree.parse(BytesIO(xml))

# first pass: count lines so we can report progress
with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_tsne3dim.csv', 'r') as f:
    f_len = sum(1 for _ in f)

print('items:', f_len)

# id -> element, built once; the old per-line tree.xpath lookup was O(n^2)
apps_by_id = {xml_app.get('id'): xml_app for xml_app in tree.iter('app')}

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_tsne3dim.csv', 'r') as f:
    for i, line in enumerate(f):
        if (i+1)%100 == 0:
            print("progress: %.2f" % (i * 100 / f_len) + "%")

        # BUGFIX: the old loop had an unreachable `i += 1` after this
        # `continue`, so skipped lines never advanced the progress counter;
        # enumerate() now counts every line.
        if len(line)<10:
            continue
        line = line.strip("\n") # appid10,43.68485200657752,17.50947333605341,12.692957234610187
        split = line.split(',')
        if len(split)!=4:
            print('line seems errorenous:',line)
            continue
        appid = split[0]
        # remove the literal prefix; strip('appid') was a char-set strip
        if appid.startswith('appid'):
            appid = appid[len('appid'):]

        # BUGFIX: the unguarded `tree.xpath(...)[0]` raised IndexError for
        # ids missing from the xml; skip them with a message instead.
        app = apps_by_id.get(appid)
        if app is None:
            print(appid, 'not in xml')
            continue

        query = app.xpath(".//positions")
        sub = query[0] if query else etree.SubElement(app, 'positions')
        query = sub.xpath(".//position[@type='tsne']")
        if len(query) == 0:
            sub2 = etree.SubElement(sub, 'position')
            sub2.set('type','tsne')
        else:
            sub2 = query[0]
        sub2.set('x',split[1])
        sub2.set('y',split[2])
        sub2.set('z',split[3])

with open('C:\\Users\\Admin\\Documents\\GitHub\\GamesGraph\\scripts\\wip_data\\_working.xml', 'wb') as f:
    f.write(etree.tostring(tree, pretty_print=True, xml_declaration=True))


items: 9537
progress: 1.04%
progress: 2.09%
progress: 3.14%
progress: 4.18%
progress: 5.23%
progress: 6.28%
progress: 7.33%
progress: 8.38%
progress: 9.43%
progress: 10.47%
progress: 11.52%
progress: 12.57%
progress: 13.62%
progress: 14.67%
progress: 15.72%
progress: 16.77%
progress: 17.81%
progress: 18.86%
progress: 19.91%
progress: 20.96%
progress: 22.01%
progress: 23.06%
progress: 24.11%
progress: 25.15%
progress: 26.20%
progress: 27.25%
progress: 28.30%
progress: 29.35%
progress: 30.40%
progress: 31.45%
progress: 32.49%
progress: 33.54%
progress: 34.59%
progress: 35.64%
progress: 36.69%
progress: 37.74%
progress: 38.79%
progress: 39.83%
progress: 40.88%
progress: 41.93%
progress: 42.98%
progress: 44.03%
progress: 45.08%
progress: 46.13%
progress: 47.17%
progress: 48.22%
progress: 49.27%
progress: 50.32%
progress: 51.37%
progress: 52.42%
progress: 53.47%
progress: 54.51%
progress: 55.56%
progress: 56.61%
progress: 57.66%
progress: 58.71%
progress: 59.76%
progress: 60.81%
progress: 61.85%
progress: 62.90%
progress: 63.95%
progress: 65.00%
progress: 66.05%
progress: 67.10%
progress: 68.15%
progress: 69.19%
progress: 70.24%
progress: 71.29%
progress: 72.34%
progress: 73.39%
progress: 74.44%
progress: 75.48%
progress: 76.53%
progress: 77.58%
progress: 78.63%
progress: 79.68%
progress: 80.73%
progress: 81.78%
progress: 82.82%
progress: 83.87%
progress: 84.92%
progress: 85.97%
progress: 87.02%
progress: 88.07%
progress: 89.12%
progress: 90.16%
progress: 91.21%
progress: 92.26%
progress: 93.31%
progress: 94.36%
progress: 95.41%
progress: 96.46%
progress: 97.50%
progress: 98.55%
progress: 99.60%

In [ ]: