In [2]:
# this block is just for the style sheet for the notebook
from IPython.core.display import HTML
def css_styling():
    """Load the notebook's custom style sheet and wrap it for display.

    Reads ``styles/custom.css`` (relative to the current working directory)
    and returns its content wrapped in an IPython ``HTML`` object.
    """
    # the context manager closes the file even if read() raises, instead of
    # leaving the handle open until it is garbage collected
    with open("styles/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)
css_styling()
Out[2]:
In [4]:
# file seperator - your program needs to be multiplatfom
import os
file_name = 'data' + os.sep + 'test'
print file_name
fin = open(file_name,'rb')
lines = fin.readlines()
fin.close()
In [5]:
# show the list returned by readlines() - one string per line of the file
print lines
In [6]:
# read the same file again, this time in text mode ('r') instead of binary
# mode ('rb'). the path is built with os.path.join instead of a hard-coded
# Windows-style r'data\test', which cannot be found on Linux or macOS
import os
file_name = os.path.join('data', 'test')
fin = open(file_name, 'r')
lines = fin.readlines()
fin.close()
In [7]:
# display the current content of the list of lines
lines
Out[7]:
In [8]:
# length of the first line - readlines() keeps the end of line character(s),
# so they are part of the count
len(lines[0])
Out[8]:
In this course we'll look at the following file types:
Data can be stored in plain text files with arbitrary delimiters (spaces, tabs, commas). The data does not need to be organized into an equal number of elements per "record" or line. Plain text files (CSV files included) are structured as a set of lines, each terminated by an end-of-line character, '\n'. We don't need to import any libraries to start working with files. The following functions are used frequently:
In [9]:
# open an existing file for reading
import os # needed for the file separation
file_name = 'data' + os.sep + 'text_file2.txt'
print file_name
my_file = open(file_name, 'rb')
# read the content of the whole file then close
file_content = my_file.read()
my_file.close()
# print the conent of the file
print file_content
In [10]:
# reopen the file in binary mode; the handle is left open on purpose so the
# following cells can read from it and move the file pointer around
my_file = open(file_name, 'rb')
In [11]:
# read 10 bytes - read(10) returns at most the next 10 bytes and moves the
# file pointer past them
characters = my_file.read(10)
print 'the first 10 bytes:', characters
In [12]:
# read the next 10 bytes - recall that the end of line character is counted as
# part of the string
characters = my_file.read(10)
print 'the second 10 bytes', characters
# convert every character to its integer code so that non-printing characters
# such as the end of line become visible
ord_list = [ord(x) for x in characters]
print 'length of list is: ', len(ord_list)
print 'list is ', ord_list
In [13]:
# read three more bytes and show the integer code of each one
characters = my_file.read(3)
print [ord(x) for x in characters]
In [14]:
# the pointer's location after the reads above: 10 + 10 + 3 = 23 bytes when
# the cells are run in order (assuming the file is at least that long)
print 'the pointer is at poistion ', my_file.tell()
In [15]:
# read the next byte - with the pointer at offset 23 this is the 24th byte
# when counting from one
print 'the 24tht byte is ', my_file.read(1)
In [16]:
# move to the beginning of the file by using the seek function: offset 0
# measured from the start of the file (second argument 0)
my_file.seek(0,0)
In [17]:
# skip 20 bytes (charactes) and read the one right after
print 'after seeking to the beginning of the file we are at location:', my_file.tell()
my_file.seek(20, 1)
print 'the 20th byte is:', my_file.read(1)
In [18]:
# skip the first 3 bytes from the beginning of the file (second argument 0
# measures the offset from the start), then report the position
my_file.seek(3, 0)
print 'i am @ ', my_file.tell()
In [19]:
# read the 4th byte (the pointer is at offset 3 when the previous cell has
# just run); reading one byte moves the pointer forward by one
print 'the 4th byte is ', my_file.read(1)
print 'we are @ : ', my_file.tell()
In [20]:
# skip 10 bytes forward from the current location (second argument 1 makes
# the offset relative to the current position)
my_file.seek(10, 1)
In [21]:
# get the current location, then read and print the single byte stored there
print 'we are at ', my_file.tell()
print my_file.read(1)
In [22]:
# read the rest of the file from this point on and print it - read() without
# a size returns everything up to the end of the file
print 'the rest of the file is:\n', my_file.read()
In [23]:
# read the last 3 bytes of the file: place the pointer 3 bytes before the end
# (second argument 2 measures the offset from the end) and read to the end
my_file.seek(-3, 2)
print my_file.read()
In [24]:
# try to read again now that we've reached the end of the file - nothing is
# left, so read() returns an empty string
print 'the charachter at the end of the file is ', my_file.read()
In [25]:
# close the file - the handle opened for the pointer experiments is released
my_file.close()
In [26]:
# now let's read one line at a time
# open an existing file for reading
file_name = 'data' + os.sep + 'text_file2.txt'
my_file = open(file_name, 'rb')
# read the first line
first_line = my_file.readline()
print first_line
# read the second line
second_line = my_file.readline()
print second_line
# close file
my_file.close()
In [27]:
# note: when readline() is called, a line is read and the "pointer" is
# moved to the next line within the file; reading is done sequentially.
# So if you want the third line you have to go through the first two and ignoring them.
# open an existing file for reading
file_name = 'data' + os.sep + 'text_file2.txt'
my_file = open(file_name, 'r')
# read the first and second lines and ignore them - this can go in a loop if you like
line = my_file.readline()
line = my_file.readline()
# read the third line
line = my_file.readline()
print line
# close the file
my_file.close()
In [28]:
# read the whole file as a list contating each line as an element
# open an existing file for reading
file_name = 'data' + os.sep + 'text_file2.txt'
my_file = open(file_name, 'rb')
# read the file as a list and close
data = my_file.readlines()
my_file.close()
# show the list - notice the end of line character in each element
print data
In [29]:
# loop though the list and print the lines - the EOL character will be printed as a
# new line so in the addition to the new line provided by the print statement we also
# get one from the string itself
for line in data:
print line
In [30]:
# loop though the list and print the lines without the extra EOL charachter
for line in data:
print line.strip()
In [31]:
# write some data to a file - if the file does not exist then it gets created
# otherwise it gets overwritten
out_file = 'data' + os.sep + 'out_file.txt'
# the handle is left open here; the next cell writes to it and closes it
test_out = open(out_file, 'w')
In [32]:
# the two pieces of text to store
str1 = 'my first line'
str2 = 'my second line'
# writelines() writes each string exactly as given, one after the other -
# nothing is inserted between them
test_out.writelines([str1, str2])
# close the file
test_out.close()
In [33]:
# if you open the file you'd see that the lines were written back to back
# without an EOL character - to avoid this, append an EOL to the string
out_file = os.path.join('data', 'out_file.txt')
test_out = open(out_file, 'w')
# the first string now carries its own end of line character
str1 = 'my first line\n'
str2 = 'my second line'
# write both strings with a single call
test_out.write(str1 + str2)
# close the file
test_out.close()
In [34]:
# another way to write the two lines
out_file = os.path.join('data', 'out_file.txt')
test_out = open(out_file, 'w')
# the strings themselves carry no end of line character
str1 = 'my first line'
str2 = 'my second line'
# put the EOL character between the two strings while writing
test_out.write('\n'.join([str1, str2]))
# close the file
test_out.close()
In [35]:
# append to the file - mode 'a' keeps the existing content and adds the new
# text at the end
out_file = os.path.join('data', 'out_file.txt')
test_out = open(out_file, 'a')
# the leading EOL ends the last line already in the file
str1 = '\nmy 3rd line\n'
str2 = 'my 4th line'
# write both strings in one go
test_out.write(''.join((str1, str2)))
# close the file
test_out.close()
In [36]:
# another way to open and parse a file is by using the "with" keyword. the
# advanatage of this is that it handles file closing automatically.
with open(out_file, 'rb') as fin:
read_data = fin.readlines()
# no closing the file as we did above
print read_data
for line in read_data:
print line.strip().split()
CSV files are popular since they can be opened in a spreadsheet or a text editor. They are organized in a table format. Here we'll work with rectangular CSV files (no data missing); later we'll work with general text files where some data is missing, where data and comments are mixed together, and where headers are part of the file to let the user know what kind of data they are processing.
CSV file access functions
In [37]:
# create a csv file the old fashioned way (it is written just like any other
# text file)
# file name - look for it in the data folder
import os
csv_file = os.path.join('data', 'csv_file.csv')
# the three rows; the last one has no end of line character
line1 = 'a,b,c,d,e\n'
line2 = 'f,g,h,i,j\n'
line3 = 'h,i,j,k,l'
# write all rows with a single call
with open(csv_file, 'wb') as fout:
    fout.write(''.join([line1, line2, line3]))
In [38]:
# let's read the file
import csv
with open(csv_file, 'rb') as fin:
csv_reader = csv.reader(fin)
for row in csv_reader:
print row
In [39]:
# this shows the difference between reading the file using the previous methods
# we learned above and the csv way
with open(csv_file, 'rb') as fin:
read_data = fin.readlines()
# no closing the file as we did above
print read_data
In [40]:
# as you can see, we got one list with each row as a single string element.
# this is not what we want from a csv file.
In [41]:
# read the file and keep the rows in a list, so the data is still available
# after the file has been closed
with open(csv_file, 'rb') as fin:
    csv_reader = csv.reader(fin)
    data_list = [row for row in csv_reader]
In [42]:
# show the data - a list with one inner list per row of the csv file
print data_list
In [43]:
# here is what we have when we reverse the list - we're going to write
# this back to another file
for row in reversed(data_list):
print row
In [44]:
# write the rows to a new csv file in reverse order (last row first); the
# order of the elements inside each row is left unchanged
import csv
import os
csv_out_file = 'data' + os.sep + 'csv_out.csv'
# the with statement closes the file even if one of the writes fails
with open(csv_out_file, 'wb') as fout:
    csv_writer = csv.writer(fout)
    for row in reversed(data_list):
        csv_writer.writerow(row)
In [45]:
# here is another example: create a second csv file, this time with headers
# file name - look for it in the data folder
csv_file = os.path.join('data', 'csv_file2.csv')
# for generating random numbers
import random
# a 6 x 6 table of random numbers, already converted to strings
random_nums = [[str(random.random()) for _ in range(6)] for _ in range(6)]
# start with the header line, then add one comma separated line per row;
# the EOL character goes between the rows only, so there is none at the end
line = 'lon,lat,alt,roll,pitch,yaw\n'
line += '\n'.join([','.join(row) for row in random_nums])
# write the file
with open(csv_file, 'wb') as fout:
    fout.write(line)
In [46]:
# now that we have the file, we can read it in one swoop into a dictionary
import csv
# the csv file we want to read
csv_file = 'data' + os.sep + 'csv_file2.csv'
# read the file into a dictionary
fin = open(csv_file, 'rb')
csv_data = csv.DictReader(fin, delimiter=',')
# let's see how the structure looks
for line in csv_data:
print line
# check if the file is closed
print fin.closed
# close it
fin.close()
# check again
print fin.closed
# let's see of we can access the csv_data again
for line in csv_data:
print line
In [47]:
# as you can see above, the DictReader parsed the file and organized the data
# into a dictionary per line with the header as the key and the data as the value
# however this data is live, which means it's a pointer to the file only so as soon
# as the file is closed we lose the data
# it also means that once we loop to the structure once we can access the data
# again
# read the file into a dictionary
fin = open(csv_file, 'rb')
csv_data = csv.DictReader(fin, delimiter=',')
# let's see how the structure looks
for line in csv_data:
print line
# this will give us nothing
for line in csv_data:
print line['yaw']
fin.close()
In [48]:
# so to get the data by column, we do this
fin = open(csv_file, 'rb')
csv_data = csv.DictReader(fin, delimiter=',')
# let's see how the structure looks
for line in csv_data:
print line['lat']
In [49]:
# let's write a csv file with headers using DictWriter
# DictReader handed us one dictionary per row, so DictWriter is given the
# same kind of structure: a list of dictionaries
import os
import csv
# the header - it fixes the column order
header = ['name', 'dob', 'role']
# one dictionary per row, keyed by the header names
rows = [
    {'name': 'mal', 'dob': 2468, 'role': 'captain'},
    {'name': 'zoe', 'dob': 2484, 'role': 'first mate'},
    {'name': 'wash', 'dob': 2468, 'role': 'pilot'},
    {'name': 'inara', 'dob': 2460, 'role': 'companion'},
    {'name': 'jayne', 'dob': 2463, 'role': 'mercenary'},
]
# write the header line followed by all the rows
with open(os.path.join('data', 'csv_output_2.csv'), 'wb') as fout:
    csv_writer = csv.DictWriter(fout, header)
    csv_writer.writeheader()
    csv_writer.writerows(rows)
In [ ]: