Doing things the hard way - can't we just use pandas?
If things work, you can use high-level tools and never care... but what about when they don't work?
Pandas objects also carry additional memory overhead; we measure it later with get_size() and DataFrame.info(memory_usage='deep').
Sample problems - code and discussion (depending on interest)
In [ ]:
# Using dir() and help() to explore an unfamiliar module
import pandas
dir(pandas)             # list the names the module exposes
help(pandas.read_fwf)   # print the docstring for a specific function
In [ ]:
for x in range(10):
    # do various things here...
    y = x * x
    print("y: ", y)
In [ ]:
import logging
# create the logger
logger = logging.getLogger('my_process')
logger.setLevel(logging.DEBUG)
# set up file for debug level messages
file_handler = logging.FileHandler('process.log')
file_handler.setLevel(logging.DEBUG)
logger.addHandler(file_handler)
# set up console for errors only
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.ERROR)
logger.addHandler(console_handler)
logger.debug("This goes only to the file")
logger.error("This goes to both the file and the console")
In [ ]:
print(open('process.log').read())
In [ ]:
import pdb
for x in range(10):
    # do various things here...
    y = x * x
    pdb.set_trace()   # drop into the interactive debugger on each iteration
In [ ]:
import sys
sys.byteorder
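Byte order matters as soon as you pack and unpack binary data yourself. A small sketch (added for illustration) showing how the "<" and ">" struct format prefixes force little- or big-endian layouts, regardless of what sys.byteorder reports:
In [ ]:
import struct
value = 1
print(struct.pack("I", value))    # native byte order (whatever sys.byteorder says)
print(struct.pack("<I", value))   # forced little-endian: b'\x01\x00\x00\x00'
print(struct.pack(">I", value))   # forced big-endian:    b'\x00\x00\x00\x01'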
In [ ]:
# example of binary files
import struct
answer = 42
month = 5
day = 6
# pack 3 ints into binary
buffer = struct.pack("III", answer, month, day)
print("Buffer as bytes:", buffer)
# write to file in binary mode
open("test_binary", "wb").write(buffer)
print("Decoded to string :", buffer.decode())
print("Unpacked to tuple:", struct.unpack("III", buffer))
# read from file in binary mode
buffer2 = open("test_binary", "rb").read()
print("Read from file:", buffer2)
# indexing a bytes object already returns an int in Python 3, so int() is redundant
print(int(buffer[0]))
print(buffer[0])
In [ ]:
b = [0, 9, 32, 48, 65, 66, 67]
b_string = bytes(b)
print(b_string.decode())
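Going the other way, str.encode() turns text into bytes, and decoding bytes that are not valid UTF-8 raises UnicodeDecodeError unless you say how the bad bytes should be handled. A short sketch added for illustration:
In [ ]:
text = "ä"
print(text.encode("utf-8"))                     # b'\xc3\xa4'
bad = b"test\xc3\x28"                           # \xc3 starts a multibyte sequence that \x28 cannot finish
print(bad.decode("utf-8", errors="replace"))    # 'test�(' -- the bad byte becomes U+FFFD
# bad.decode("utf-8") with no errors argument raises UnicodeDecodeError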
In [ ]:
# text file example
open("test", "w", newline='').write("this is\nä\x80\ff\r\ntest\xc3\x28")
print("this is\nä\x80\ff\r\ntest\xc3\x28")
In [ ]:
text = open("test", "r", newline='\r\n').read()
# "test2" is written by the NUL-byte cleanup example further down; run that cell first
text2 = open("test2", "r").read()
print(text2)
text2
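When a file contains bytes that are not valid in the expected encoding, a plain read() raises UnicodeDecodeError; open() takes encoding and errors arguments to control this. A small sketch using a throwaway file name (bad_utf8) chosen for illustration:
In [ ]:
# write raw bytes that are not valid UTF-8, then read them back leniently
open("bad_utf8", "wb").write(b"test\xc3\x28")
print(open("bad_utf8", "r", encoding="utf-8", errors="replace").read())
# without errors="replace", read() raises UnicodeDecodeError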
In [ ]:
moby = []
with open("moby_dick_01.txt") as ch01:
    for line in ch01:
        moby.append(line)
print(moby[:10])
Problem - London weather
http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_fwf.html
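Before parsing the file by hand, it is worth seeing what the high-level route looks like. A rough sketch using pandas.read_fwf on the Heathrow file; the skiprows value is a guess and needs adjusting to the real number of preamble lines in heathrowdata.txt:
In [ ]:
import pandas
# skiprows is an assumption -- set it to the number of lines before the data starts
df_fwf = pandas.read_fwf("heathrowdata.txt", skiprows=7)
print(df_fwf.head())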
In [ ]:
with open("heathrowdata.txt") as LHR_input:
for row in LHR_input:
print(row)
In [ ]:
lines = LHR.split("\n")
header_1 = lines[0].split()
header_2 = [""] * 2 + lines[1].split()
print(header_2)
header = ["\n".join(x) for x in zip(header_1, header_2)]
print(header)
#df = pandas.read_fwf(LHR)
records = [dict(zip(header, line.split())) for line in lines[2:]]
print(records[2])
df = pandas.DataFrame([x.split() for x in lines])
df2 = pandas.DataFrame(records[2:])
print(df.shape)
print(df2)
help(pandas.read_fwf)
In [ ]:
with open("temp_data_01.txt") as input_file:
for row in input_file:
print(row)
In [ ]:
with open("temp_data_01.csv") as input_file:
for row in input_file:
print(row)
In [ ]:
with open("temp_data_pipes_01.txt") as input_file:
for row in input_file:
print(row)
In [ ]:
with open("Meteorite_Landings.tsv") as input_file:
for row in input_file:
print(row)
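Printing the raw rows shows which delimiter each file uses; the csv module will split the rows for you once you pass the right delimiter. A sketch using the files read above (the delimiters are inferred from the file names, so check them against the raw output):
In [ ]:
import csv
for row in csv.reader(open("temp_data_01.csv")):                         # comma-separated
    print(row)
for row in csv.reader(open("temp_data_pipes_01.txt"), delimiter="|"):    # pipe-separated
    print(row)
for row in csv.reader(open("Meteorite_Landings.tsv"), delimiter="\t"):   # tab-separated
    print(row)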
In [ ]:
with open("london.json") as input_file:
for row in input_file:
print(row)
In [ ]:
with open("landslide.json") as input_file:
for row in input_file:
print(row)
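JSON is a nested structure rather than a table, so reading it line by line only gets you the raw text; json.load parses the whole document into dictionaries and lists. A sketch against one of the files above (what comes back depends entirely on the file's structure):
In [ ]:
import json
with open("london.json") as input_file:
    data = json.load(input_file)
print(type(data))
# top-level keys if it is an object, element count if it is an array
print(list(data.keys()) if isinstance(data, dict) else len(data))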
In [ ]:
open("test2", "wb").write(b"this,is,a\ntest\x00,null,file")
import csv
for x in csv.reader(open("test2", "r")):
print(x)
# Cleaning NULL (\x00) bytes from a data file
fi = open('my.csv', 'rb')
data = fi.read()
fi.close()
fo = open('mynew.csv', 'wb')
fo.write(data.replace('\x00', ''))
fo.close()
# alternative
reader = csv.reader(x.replace('\0', '') for x in mycsv)
In [ ]:
with open("chicago.json") as input_file:
for row in input_file:
print(row)
In [ ]:
with open("mars.json") as input_file:
for row in input_file:
print(row)
In [ ]:
with open("landslide.xml") as input_file:
for row in input_file:
print(row)
In [ ]:
with open("observations.xml") as input_file:
for row in input_file:
print(row)
In [ ]:
with open("weather_01.xml") as input_file:
for row in input_file:
print(row)
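For XML, the standard-library route is xml.etree.ElementTree, which parses the document into a tree of elements you can walk. A sketch against one of the files above; no tag names are assumed, it just reports what it finds at the top of the tree:
In [ ]:
import xml.etree.ElementTree as ET
tree = ET.parse("observations.xml")
root = tree.getroot()
print(root.tag, root.attrib)
for child in root:              # immediate children and their attributes
    print(child.tag, child.attrib)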
In [ ]:
import sys
# get_size() is defined in the next cell -- run that cell first
print(get_size(LHR))
print(get_size(lines))
print(get_size(records))
print(get_size(df2))
df2.info(memory_usage='deep')
In [ ]:
def get_size(obj, seen=None):
    """Recursively finds size of objects"""
    size = sys.getsizeof(obj)
    if seen is None:
        seen = set()
    obj_id = id(obj)
    if obj_id in seen:
        return 0
    # Important: mark as seen *before* entering recursion to gracefully handle
    # self-referential objects
    seen.add(obj_id)
    if isinstance(obj, dict):
        size += sum([get_size(v, seen) for v in obj.values()])
        size += sum([get_size(k, seen) for k in obj.keys()])
    elif hasattr(obj, '__dict__'):
        size += get_size(obj.__dict__, seen)
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        size += sum([get_size(i, seen) for i in obj])
    return size
(Example: based on Grainger product feed -> MongoDB, combining 4 files)
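A very rough sketch of that pattern: merge several per-product files on a shared key, then load the combined records into MongoDB with pymongo. The file names, the product_id column, and the database/collection names are all placeholders, not details of the actual feed:
In [ ]:
import csv
from pymongo import MongoClient   # third-party: pip install pymongo; assumes a local mongod is running
merged = {}
for name in ["feed_main.csv", "feed_prices.csv", "feed_stock.csv", "feed_specs.csv"]:   # placeholder files
    with open(name) as f:
        for row in csv.DictReader(f):
            merged.setdefault(row["product_id"], {}).update(row)   # placeholder key column
client = MongoClient()
client["grainger"]["products"].insert_many(list(merged.values()))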
In [ ]:
import sqlalchemy
# SQLAlchemy example
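One possible shape for that example, using SQLAlchemy Core against an in-memory SQLite database; the table and column names are made up for illustration:
In [ ]:
import sqlalchemy
engine = sqlalchemy.create_engine("sqlite:///:memory:")   # throwaway in-memory database
metadata = sqlalchemy.MetaData()
readings = sqlalchemy.Table(
    "readings", metadata,
    sqlalchemy.Column("id", sqlalchemy.Integer, primary_key=True),
    sqlalchemy.Column("station", sqlalchemy.String),
    sqlalchemy.Column("tmax", sqlalchemy.Float),
)
metadata.create_all(engine)
with engine.begin() as conn:        # begin() commits the transaction when the block exits
    conn.execute(readings.insert(), [{"station": "LHR", "tmax": 21.4}])
    for row in conn.execute(sqlalchemy.select(readings)):
        print(row)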
In [ ]:
#FTP example
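A minimal anonymous-FTP sketch using the standard-library ftplib; the host, directory, and file names are placeholders:
In [ ]:
from ftplib import FTP
ftp = FTP("ftp.example.com")       # placeholder host
ftp.login()                        # anonymous login
ftp.cwd("/pub/data")               # placeholder directory
with open("local_copy.txt", "wb") as out:
    ftp.retrbinary("RETR data.txt", out.write)   # placeholder remote file
ftp.quit()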
In [ ]:
#SFTP example
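ftplib does not speak SFTP; a common third-party choice is paramiko. A sketch assuming paramiko is installed and password authentication is acceptable; host, credentials, and paths are placeholders:
In [ ]:
import paramiko                    # third-party: pip install paramiko
ssh = paramiko.SSHClient()
ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())   # convenient for a demo, but skips host-key checking
ssh.connect("sftp.example.com", username="user", password="secret")   # placeholders
sftp = ssh.open_sftp()
sftp.get("/remote/path/data.txt", "data.txt")   # download one file
sftp.close()
ssh.close()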
In [ ]:
# API example
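Most web APIs come down to an HTTP GET that returns JSON, which the third-party requests library handles in a few lines. A sketch against a placeholder endpoint; the URL and query parameters are assumptions:
In [ ]:
import requests                    # third-party: pip install requests
response = requests.get("https://api.example.com/observations",
                        params={"station": "LHR"}, timeout=30)   # placeholder endpoint and parameters
response.raise_for_status()        # raise on a non-2xx status code
data = response.json()             # parse the JSON body
print(type(data))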