In [21]:
#pandas is commonly imported as pd
import pandas as pd
#We'll import the other libraries as needed
In [24]:
# Naive CSV parsing: str.split gives every field back as a string.
print("Split on comma as strings")
csv_row = '1,2.0,Three point five,True'
print(csv_row.split(','))

# A homogeneous row can be converted in a single pass.
print("\nSplit on comma and converted to ints")
csv_row = '1,2,3,4,5,6,7,8,9'
print(list(map(int, csv_row.split(','))))
But real CSV files contain more than just strings, or rows with a single data type. We can use the csv library to handle the edge cases and memory management for us.
In [11]:
import csv

# Read the CSV with the stdlib csv module. Per the csv docs, the file
# should be opened with newline='' so quoted fields that contain
# newlines are parsed correctly.
with open('data/volcanoes.csv', newline='') as fin:
    csv_data = csv.reader(fin)
    # csv.reader is a lazy iterator; normally you'd loop:
    # for row in csv_data: do stuff
    # Use the built-in next() rather than calling __next__() directly.
    print(next(csv_data))   # header row
    print()
    print(next(csv_data))   # first data row
In [16]:
# DictReader yields one dict per row, keyed by the header line.
# newline='' is the csv module's documented way to open input files.
with open('data/volcanoes.csv', newline='') as fin:
    csv_dict_data = csv.DictReader(fin)
    for row in csv_dict_data:
        print(row)
        print()
        print('Name:', row['Name'], '\nType:', row['Type'])
        break  # only demonstrate the first row
In [31]:
# Pass the path directly: pandas opens and closes the file itself.
# (pd.read_csv(open(...)) leaks the file handle until GC.)
csv_df = pd.read_csv('data/volcanoes.csv')
csv_df.head()
Out[31]:
In [19]:
import openpyxl
# You can also use xlrd, xlsxwriter, and a host of others.
# See http://www.python-excel.org/ for all your options.

# Write a tiny workbook. Rename the default sheet instead of calling
# create_sheet(), which would leave an extra empty "Sheet" in the file.
wb = openpyxl.Workbook()
ws = wb.active
ws.title = "NewSheet"
ws['A1'] = "Name"
ws['A2'] = "Michael"
ws['B1'] = "fav_color"
ws['B2'] = "Purple"
wb.save("data/ExcelData.xlsx")

# Read it back. (This is a file path, not a row.)
excel_path = 'data/ExcelData.xlsx'
data = openpyxl.load_workbook(excel_path)
for sheet in data:
    for row in sheet:
        print(row)                              # tuple of Cell objects
        print(*[cell.value for cell in row])    # just the values
In [30]:
file = "data/volcanoes.xlsx"
# pandas builds its Excel support on top of several reader/writer
# libraries (e.g. xlrd, openpyxl, xlsxwriter); this is reflected in how
# many ways there are to read in an Excel file.
alternateLoad = pd.ExcelFile(file)
basicLoad = pd.read_excel(file)
basicLoad.head()
Out[30]:
In [33]:
# read_excel defaults to reading the first sheet in an Excel book.
# For a comprehensive list of parameters for read_excel, see: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_excel.html
# You can specify the sheets you want by name.
# NOTE: the keyword is sheet_name (the old `sheetname` spelling was
# removed in modern pandas) and skipfooter replaced skip_footer.
sheetByName = pd.read_excel(file, sheet_name="volcanoes")
# ...by index (passing a list returns a dict of DataFrames keyed by index)
sheetsByIndex = pd.read_excel(file, sheet_name=[0])
# If you don't know which sheets you want, pass sheet_name=None and ALL
# sheets are loaded into a dict keyed by sheet name. (header=None does
# NOT do this -- it only tells pandas the file has no header row.)
allSheetsByHeader = pd.read_excel(file, sheet_name=None)
allSheetsBySheets = pd.read_excel(file, sheet_name=0)  # first sheet only
# You can skip rows or columns
cols = ['Number', 'Name', 'Country', 'Region', 'Type', 'Activity Evidence',
        'Last Known Eruption', 'Latitude', 'Longitude', 'Elevation (Meters)',
        'Dominant Rock Type', 'Tectonic Setting']
subset = pd.read_excel(file, skipfooter=5, skiprows=2, names=cols)
In [69]:
import json

# Sample records: each entry maps a person to their favorite color.
color_data = [
    dict(name="Michael", fav_color="purple"),
    dict(name="Casey", fav_color="turquoise"),
]
# load, loads, dump, and dumps are common library APIs:
# load/dump take file objects while loads/dumps take strings.
print(json.dumps(color_data))
In [54]:
# Use a context manager so the file handle is closed deterministically.
# (json.load(open(...)) leaks the handle until garbage collection.)
with open('data/volcanoes.json') as fin:
    json_data = json.load(fin)
print('Rows:', len(json_data))
In [52]:
# pandas can also ingest the JSON file straight into a DataFrame.
json_df = pd.read_json('data/volcanoes.json')
# Peek at the first five rows.
json_df.head()
Out[52]:
In [60]:
import xml.etree.ElementTree as ET

# Parse the XML document from disk.
# (ET.fromstring(some_xml_string) handles raw strings instead.)
tree = ET.parse('data/volcano.xml')
root = tree.getroot()
# Walk the root's direct children, printing tag/text pairs.
for child in root:
    print(child.tag, ' -- ', child.text)
In [67]:
import yaml

# yaml.load without an explicit Loader is unsafe (it can construct
# arbitrary Python objects from untrusted input) and is deprecated;
# safe_load builds only plain Python types. The with-block also fixes
# the leaked file handle from yaml.load(open(...)).
with open('data/volcanoes.yml') as fin:
    yaml_data = yaml.safe_load(fin)
print('Rows:', len(yaml_data))
In [68]:
# pd.json_normalize replaces the deprecated pd.io.json.json_normalize,
# and yaml.safe_load replaces the unsafe/deprecated bare yaml.load.
with open('data/volcanoes.yml') as fin:
    yaml_df = pd.json_normalize(yaml.safe_load(fin))
yaml_df.head()
Out[68]:
In [72]:
import pickle

# pickle.dumps actually returns raw bytes (not hex). Security note:
# never call pickle.load on untrusted data -- it can execute arbitrary code.
print('Raw hex format:\n', pickle.dumps(color_data))
# Context managers close both file handles deterministically.
with open('data/colors.pkl', 'wb') as fout:
    pickle.dump(color_data, fout)
with open('data/colors.pkl', 'rb') as fin:
    new_data = pickle.load(fin)
print("\nSame? :", color_data == new_data)
In [46]:
# Load the pickled list back and wrap it in a DataFrame for rich display.
color_df = pd.DataFrame(pd.read_pickle('data/colors.pkl'))
color_df
Out[46]:
In [47]:
# DataFrames pickle directly via .to_pickle (read back with pd.read_pickle).
color_df.to_pickle('data/color_df.pkl')
In [37]:
#Create full JSON from CSV
# Read every row as a dict (newline='' per the csv docs), then
# serialize the whole list. Writing through a context manager ensures
# the output file is flushed and closed -- json.dump(..., open(..., 'w'))
# leaks the handle and may leave the file unflushed.
with open('data/volcanoes.csv', newline='') as fin:
    data = [dict(row) for row in csv.DictReader(fin)]
with open('data/volcanoes.json', 'w') as fout:
    json.dump(data, fout)
In [ ]:
#Create single element XML from the JSON data
#This library is not included in the standard library (pip install dicttoxml)
from dicttoxml import dicttoxml

# BUG FIX: the output file must be opened in write mode ('w'); the
# original open('data/volcano.xml') opens read-only, so print(..., file=...)
# would raise io.UnsupportedOperation. The with-block flushes and closes.
with open('data/volcano.xml', 'w') as fout:
    print(dicttoxml(json_data[0]), file=fout)
In [64]:
#Create full YAML from the JSON data
# Use a with-block so the output file is flushed and closed
# (yaml.dump(..., open(..., 'w')) leaks the handle).
with open('data/volcanoes.yml', 'w') as fout:
    yaml.dump(json_data, fout)