Working with JSON files

JSON is one of the most popular data storage formats nowadays. In short, a JSON document typically looks like a collection of Python dictionaries inside a list.


In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import requests
from pprint import pprint
from bs4 import BeautifulSoup

from selenium import webdriver
import re
import json
from lxml import html

JSON


In [12]:
# Example JSON-like structure: a list of records, each one a dict of string fields.
my_json = [
    dict(name="Hrant", surname="Davtyan"),
    dict(name="Davit", surname="Abgaryan"),
]

In [13]:
# Endpoint requested below; its response body is a JSON document.
url = "http://api.open-notify.org/astros.json"

In [17]:
# Fetch the API response. The timeout keeps the cell from hanging indefinitely
# if the server is unreachable, and raise_for_status() reports an HTTP error
# here instead of letting it surface later as a confusing JSON decode failure.
response = requests.get(url, timeout=10)
response.raise_for_status()
response.content


Out[17]:
b'{"message": "success", "people": [{"craft": "ISS", "name": "Oleg Artemyev"}, {"craft": "ISS", "name": "Andrew Feustel"}, {"craft": "ISS", "name": "Richard Arnold"}, {"craft": "ISS", "name": "Sergey Prokopyev"}, {"craft": "ISS", "name": "Alexander Gerst"}, {"craft": "ISS", "name": "Serena Aunon-Chancellor"}], "number": 6}'

In [18]:
# Check the type of the raw response body: it is bytes, not yet a Python dict.
type(response.content)


Out[18]:
bytes

In [29]:
# If the endpoint (API) returns JSON directly rather than HTML,
# requests can decode the body into Python objects with .json().
# NOTE: this rebinds my_json, replacing the hand-written example list.
my_json = response.json()

In [33]:
# Pretty-print the decoded response so the nesting is easy to read.
pprint(my_json)


{'message': 'success',
 'number': 6,
 'people': [{'craft': 'ISS', 'name': 'Oleg Artemyev'},
            {'craft': 'ISS', 'name': 'Andrew Feustel'},
            {'craft': 'ISS', 'name': 'Richard Arnold'},
            {'craft': 'ISS', 'name': 'Sergey Prokopyev'},
            {'craft': 'ISS', 'name': 'Alexander Gerst'},
            {'craft': 'ISS', 'name': 'Serena Aunon-Chancellor'}]}

In [41]:
# Collect every name from the JSON above by walking the "people" list by position.
[my_json["people"][idx]["name"] for idx in range(len(my_json["people"]))]


Out[41]:
['Oleg Artemyev',
 'Andrew Feustel',
 'Richard Arnold',
 'Sergey Prokopyev',
 'Alexander Gerst',
 'Serena Aunon-Chancellor']

In [42]:
# Shorter version: iterate over the records themselves instead of their indices.
[person["name"] for person in my_json["people"]]


Out[42]:
['Oleg Artemyev',
 'Andrew Feustel',
 'Richard Arnold',
 'Sergey Prokopyev',
 'Alexander Gerst',
 'Serena Aunon-Chancellor']

In [44]:
# pandas can read JSON as well; however, it loses the structure when the
# JSON document is nested: the nested records are left as dicts in a single
# column, as shown below.
df = pd.read_json(url)

In [45]:
# Display the frame: each row carries one "people" entry as an unexpanded dict.
df


Out[45]:
message number people
0 success 6 {'craft': 'ISS', 'name': 'Oleg Artemyev'}
1 success 6 {'craft': 'ISS', 'name': 'Andrew Feustel'}
2 success 6 {'craft': 'ISS', 'name': 'Richard Arnold'}
3 success 6 {'craft': 'ISS', 'name': 'Sergey Prokopyev'}
4 success 6 {'craft': 'ISS', 'name': 'Alexander Gerst'}
5 success 6 {'craft': 'ISS', 'name': 'Serena Aunon-Chancel...

In [53]:
# Write the JSON object to disk, indented by 4 spaces for readability.
with open("my_json.json", mode="w") as out_file:
    json.dump(my_json, out_file, indent=4)

In [55]:
# Read the JSON document back from disk into Python objects.
with open('my_json.json', mode="r") as in_file:
    data = json.load(in_file)

In [50]:
# Pretty-print what was loaded from the file to compare it with the original.
pprint(data)


{'message': 'success',
 'number': 6,
 'people': [{'craft': 'ISS', 'name': 'Oleg Artemyev'},
            {'craft': 'ISS', 'name': 'Andrew Feustel'},
            {'craft': 'ISS', 'name': 'Richard Arnold'},
            {'craft': 'ISS', 'name': 'Sergey Prokopyev'},
            {'craft': 'ISS', 'name': 'Alexander Gerst'},
            {'craft': 'ISS', 'name': 'Serena Aunon-Chancellor'}]}

In [75]:
# Creating a string that holds a JSON document (dictionaries nested in a list).
# Keys and string values use double quotes inside the text.
json_str =   '''
{"message": "success",
"number": 6,
"people": [{"craft": "ISS", "name": "Oleg Artemyev"},
            {"craft": "ISS", "name": "Oleg Artemyev"}]}'''

In [74]:
# str() gives the Python repr of the dict (single-quoted), not JSON text.
str(my_json)


Out[74]:
"{'message': 'success', 'people': [{'craft': 'ISS', 'name': 'Oleg Artemyev'}, {'craft': 'ISS', 'name': 'Andrew Feustel'}, {'craft': 'ISS', 'name': 'Richard Arnold'}, {'craft': 'ISS', 'name': 'Sergey Prokopyev'}, {'craft': 'ISS', 'name': 'Alexander Gerst'}, {'craft': 'ISS', 'name': 'Serena Aunon-Chancellor'}], 'number': 6}"

In [60]:
# Print the text held in json_str.
print(json_str)



{'message': 'success',
 'number': 6,
 'people': [{'craft': 'ISS', 'name': 'Oleg Artemyev'},
            {'craft': 'ISS', 'name': 'Andrew Feustel'},
            {'craft': 'ISS', 'name': 'Richard Arnold'},
            {'craft': 'ISS', 'name': 'Sergey Prokopyev'},
            {'craft': 'ISS', 'name': 'Alexander Gerst'},
            {'craft': 'ISS', 'name': 'Serena Aunon-Chancellor'}]}


In [77]:
# Loading JSON from a string: json.loads parses text, whereas json.load
# (used earlier) reads from an open file object.
new_json = json.loads(json_str)

In [80]:
# Pretty-print the object parsed from the string.
pprint(new_json)


{'message': 'success',
 'number': 6,
 'people': [{'craft': 'ISS', 'name': 'Oleg Artemyev'},
            {'craft': 'ISS', 'name': 'Oleg Artemyev'}]}

XML


In [163]:
# Sample XML document: two <person> elements side by side (no single root);
# only the first one carries a class attribute.
my_xml = """
<person class="Dilijan">
    <name>Hrant</name>
    <surname>Davtyan</surname>
</person>
<person>
    <name>Davit</name>
    <surname>Abgaryan</surname>
</person>
"""

In [83]:
# The XML above would look like this if it were converted to JSON.
# The two records are wrapped in a list so that the string is one parseable
# JSON document; bare comma-separated objects are rejected by json.loads.
# (The class attribute of the first person is left out for brevity.)
xml_json = """
[{"person":
     {"name":"Hrant",
      "surname":"Davtyan"}},
 {"person":
     {"name":"Davit",
      "surname":"Abgaryan"}}]
"""

In [124]:
# Parse the snippet with lxml's HTML parser and check what kind of object comes back.
tree = html.document_fromstring(my_xml)
type(tree)


Out[124]:
lxml.html.HtmlElement

In [128]:
# text_content() gathers all text under the element, similar to get_text() in
# BeautifulSoup; the newlines are removed and the ends stripped for display.
tree.text_content().replace("\n","").strip()


Out[128]:
'Hrant    Davtyan    Davit    Abgaryan'

In [129]:
# xpath() returns a list of every matching element, similar to find_all in
# BeautifulSoup; here: each <name> that is a direct child of a <person>.
tree.xpath("//person/name")


Out[129]:
[<Element name at 0x1d4d0e43f98>, <Element name at 0x1d4d0e43598>]

In [130]:
# Extract the text of each matched <name> element.
my_names = [name_el.text_content() for name_el in tree.xpath("//person/name")]

In [131]:
# Show the extracted names.
print(my_names)


['Hrant', 'Davit']

In [135]:
# cssselect() takes a CSS selector and returns the matching elements,
# similar to select() in BeautifulSoup.
tree.cssselect("person")


Out[135]:
[<Element person at 0x1d4d0e490e8>, <Element person at 0x1d4d0e498b8>]

In [170]:
# Filter on an attribute value in XPath, then take the text of the first match.
tree.xpath("//person[@class='Dilijan']")[0].text_content()


Out[170]:
'\n    Hrant\n    Davtyan\n'

In [171]:
# Selecting @class returns the attribute values themselves rather than elements.
tree.xpath("//person/@class")


Out[171]:
['Dilijan']

In [ ]: