In [136]:
from collections import OrderedDict
import csv
from dateutil.parser import parse
import datetime,time
class DataFrame(object):
    """A minimal in-memory table whose rows are keyed by column name.

    ``self.header`` is the list of column names and ``self.data`` is a list
    of ``OrderedDict`` rows mapping each header name to that row's value.
    The implementation avoids version-specific syntax so that it runs on
    both Python 2 and Python 3.
    """

    # Timestamp layout accepted by the date-aware statistics, and the layout
    # used when a date result is reported.
    _DATE_INPUT_FORMAT = '%m/%d/%y %H:%M'
    _DATE_OUTPUT_FORMAT = '%m-%d-%y %H:%M'
    _FAILURE_MESSAGE = 'Cannot be calculated'

    @classmethod
    def from_csv(cls, file_path, delimiting_character=',', quote_character='"'):
        """Build a DataFrame from a CSV file whose first row is the header.

        Args:
            file_path: path of the CSV file to read.
            delimiting_character: field delimiter handed to ``csv.reader``.
            quote_character: quote character handed to ``csv.reader``.

        Returns:
            A new instance of ``cls`` holding the file's rows.
        """
        import sys

        # The 'U' mode flag was removed in Python 3.11; on Python 3 the csv
        # module wants the file opened with newline='' instead.
        if sys.version_info[0] >= 3:
            infile = open(file_path, 'r', newline='')
        else:
            infile = open(file_path, 'rU')
        with infile:
            reader = csv.reader(infile, delimiter=delimiting_character,
                                quotechar=quote_character)
            data = list(reader)
        return cls(list_of_lists=data)

    def __init__(self, list_of_lists, header=True):
        """Store the given rows.

        Args:
            list_of_lists: sequence of rows, each a sequence of strings.
            header: when true the first row supplies the column names;
                otherwise names ``column1``, ``column2``, ... are generated
                from the width of the first row.

        Raises:
            ValueError: if two columns share the same name.
        """
        rows = list(list_of_lists)
        if not rows:
            # Nothing to index into: produce an empty table instead of
            # failing with an IndexError.
            self.header = []
            self.data = []
            return

        if header:
            self.header = list(rows[0])
            body = rows[1:]
        else:
            self.header = ['column' + str(index + 1)
                           for index in range(len(rows[0]))]
            body = rows

        ############# task 1 #############
        # Rows are dictionaries keyed by column name, so names must be unique.
        # ValueError subclasses Exception, so existing handlers still match.
        if len(self.header) != len(set(self.header)):
            raise ValueError('There are duplicates!!!')
        ############# end task 1 #############

        ############# task 2 #############
        # Strip surrounding whitespace from every cell before keying it.
        # zip() silently truncates a row that is longer than the header.
        self.data = [OrderedDict(zip(self.header, [s.strip() for s in row]))
                     for row in body]
        ############# end task 2 #############

    def __getitem__(self, item):
        """Select rows and/or columns.

        Supported keys:
            * ``int`` / ``slice``: one row, or a list of rows.
            * ``str``: every value of the named column.
            * ``list``: a list of column names; returns one list per row.
            * ``(rows, columns)`` tuple: ``rows`` is an int, a slice or a
              list of row indices; ``columns`` is an int, a slice, a column
              name, or a list of positions or of names.  The result always
              has one entry per selected row, so an int row index yields a
              one-element list.

        Raises:
            TypeError: if the key, or the column part of a tuple key, has an
                unsupported type.
        """
        # rows only
        if isinstance(item, (int, slice)):
            return self.data[item]
        # one column
        if isinstance(item, str):
            return [row[item] for row in self.data]
        # rows and columns
        if isinstance(item, tuple):
            # Build the selector first so that a bad column key is rejected
            # even when no row ends up selected.
            pick = self._column_selector(item[1])
            return [pick(row) for row in self._select_rows(item[0])]
        # list of column names
        if isinstance(item, list):
            return [[row[column_name] for column_name in item]
                    for row in self.data]
        raise TypeError('Unsupported key type: %s' % type(item).__name__)

    def _select_rows(self, rows_key):
        """Return the list of rows addressed by an int, slice or index list."""
        if isinstance(rows_key, list):
            # Rows come back in table order, each at most once.
            return [row for index, row in enumerate(self.data)
                    if index in rows_key]
        if isinstance(rows_key, int):
            # Wrap the single row so callers can always iterate over rows.
            return [self.data[rows_key]]
        return self.data[rows_key]

    @staticmethod
    def _column_selector(cols_key):
        """Return a function that extracts ``cols_key`` from a single row."""
        if isinstance(cols_key, list):
            if all(isinstance(key, int) for key in cols_key):
                # Positions are reported in column order, not in key order.
                return lambda row: [value for position, value
                                    in enumerate(row.values())
                                    if position in cols_key]
            if all(isinstance(key, str) for key in cols_key):
                return lambda row: [row[name] for name in cols_key]
            raise TypeError('What the hell is this?')
        if isinstance(cols_key, (int, slice)):
            return lambda row: list(row.values())[cols_key]
        if isinstance(cols_key, str):
            return lambda row: row[cols_key]
        raise TypeError('I don\'t know how to handle this...')

    def get_rows_where_column_has_value(self, column_name, value, index_only=False):
        """Find the rows whose ``column_name`` cell equals ``value``.

        Returns the matching row indices when ``index_only`` is true,
        otherwise the matching rows themselves.
        """
        if index_only:
            return [index for index, row_value in enumerate(self[column_name])
                    if row_value == value]
        return [row for row in self.data if row[column_name] == value]

    ############# task 3 #############
    def _numeric_column(self, col_name):
        """Parse a column as floats, ignoring thousands separators.

        Raises ValueError if any cell is not a number.
        """
        return [float(row[col_name].replace(',', '')) for row in self.data]

    def _epoch_column(self, col_name):
        """Parse a column of timestamps into local-time epoch seconds.

        Raises ValueError if any cell does not match the input date format.
        """
        stamps = [datetime.datetime.strptime(row[col_name],
                                             self._DATE_INPUT_FORMAT)
                  for row in self.data]
        return [time.mktime(stamp.timetuple()) for stamp in stamps]

    def _fail(self):
        """Report that a statistic could not be computed; returns None."""
        print(self._FAILURE_MESSAGE)
        return None

    def _report(self, col_name, reducer):
        """Apply ``reducer`` to a column read as numbers or, failing that, dates.

        The result is printed and returned: a float for numeric columns, a
        formatted timestamp string for date columns.  When the column is
        empty or is neither numeric nor a date, the failure message is
        printed and None is returned.
        """
        as_dates = False
        try:
            values = self._numeric_column(col_name)
        except ValueError:
            as_dates = True
            try:
                values = self._epoch_column(col_name)
            except (ValueError, OverflowError):
                # Not a date column either, or a date mktime cannot represent.
                values = []
        if not values:
            return self._fail()
        result = reducer(values)
        if as_dates:
            result = time.strftime(self._DATE_OUTPUT_FORMAT,
                                   time.localtime(result))
        print(result)
        return result

    @staticmethod
    def _median(values):
        """Return the median of a non-empty list of numbers."""
        ordered = sorted(values)
        center = len(ordered) // 2
        if len(ordered) % 2 == 0:
            return (ordered[center - 1] + ordered[center]) / 2.0
        return ordered[center]

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of a non-empty list of floats."""
        return sum(values) / len(values)

    def min(self, col_name):
        """Print and return the smallest value of a numeric or date column."""
        # Inside a method body ``min`` is the builtin, not this method.
        return self._report(col_name, min)

    def max(self, col_name):
        """Print and return the largest value of a numeric or date column."""
        return self._report(col_name, max)

    def median(self, col_name):
        """Print and return the median of a numeric or date column."""
        return self._report(col_name, self._median)

    def mean(self, col_name):
        """Print and return the mean of a numeric or date column."""
        return self._report(col_name, self._mean)

    def sum(self, col_name):
        """Print and return the total of a numeric column (0 when empty)."""
        try:
            total = sum(self._numeric_column(col_name))
        except ValueError:
            return self._fail()
        print(total)
        return total

    def std(self, col_name):
        """Print and return the population standard deviation of a numeric column."""
        try:
            values = self._numeric_column(col_name)
        except ValueError:
            return self._fail()
        if not values:
            # Avoid dividing by zero on an empty table.
            return self._fail()
        count = len(values)
        mean = sum(values) / count
        result = sum((x - mean) ** 2 / count for x in values) ** 0.5
        print(result)
        return result
    ############# end task 3 #############

    ############# HW2 task 4 #############
    def add_rows(self, list_of_lists):
        """Append rows to the table.

        Every new row must have exactly one value per column.  If any row
        has a different length, a message is printed, nothing is added and
        None is returned; otherwise all rows are appended and ``self`` is
        returned.
        """
        width = len(self.header)
        # Validate everything up front so a bad row cannot be half-applied.
        if any(len(new_list) != width for new_list in list_of_lists):
            print('Wrong number of len')
            return None
        new_rows = [OrderedDict(zip(self.header, row)) for row in list_of_lists]
        self.data = self.data + new_rows
        return self
    ############# end task 4 #############

    ############# HW2 task 5 #############
    def add_column(self, list_of_lists, col_name):
        """Append one or more columns to every row.

        Args:
            list_of_lists: one entry per existing row.  A list or tuple
                entry supplies one value per new column; any other entry is
                taken as the single value of a single new column.
            col_name: the new column's name, or a list/tuple of names.

        Returns:
            ``self`` on success.  If the number of entries does not match
            the number of rows, or an entry does not supply one value per
            new column, a message is printed and None is returned.

        Raises:
            ValueError: if a new name duplicates another column name.
        """
        if isinstance(col_name, (list, tuple)):
            new_names = list(col_name)
        else:
            new_names = [col_name]

        if len(list_of_lists) != len(self.data):
            print("Wrong number of column")
            return None
        new_values = [list(entry) if isinstance(entry, (list, tuple)) else [entry]
                      for entry in list_of_lists]
        if any(len(values) != len(new_names) for values in new_values):
            print("Wrong number of column")
            return None

        # Keep the invariant enforced by the constructor: unique names.
        combined = self.header + new_names
        if len(combined) != len(set(combined)):
            raise ValueError('There are duplicates!!!')

        self.header = combined
        for row, values in zip(self.data, new_values):
            row.update(zip(new_names, values))
        return self
    ############# end task 5 #############
In [137]:
# Load SalesJan2009.csv into a DataFrame; from_csv treats the file's first
# row as the column names.
df = DataFrame.from_csv('SalesJan2009.csv')
In [138]:
# An integer key selects a single row: fetch the row at index 3.
value=df[3]
# Bare expression so the notebook displays the row.
value
Out[138]:
In [139]:
# Run every summary statistic on the Last_Login column.
x='Last_Login'
# Each call prints its result as a side effect.
test1 = df.min(x)
test2 = df.max(x)
test3 = df.median(x)
test4 = df.sum(x)
test5 = df.mean(x)
test6 = df.std(x)
# Bare names: these only display something if the method also returned a value.
test1
test2
test3
test4
test5
test6
In [145]:
In [146]:
Out[146]:
In [ ]: