In [1]:
%matplotlib inline
In [2]:
import numpy as np
import matplotlib.pyplot as plt
import csv
In [3]:
# Load the grants CSV into `data`: a dict mapping each header name to the list
# of raw string values found in that column, in file order.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file before running.
fn="/Users/qiqi/Desktop/Administrative_Discretionary_Grants 1996-2014_.csv"

# newline="" is what the csv module asks for: it lets the reader do its own
# line-ending handling so quoted fields containing newlines parse correctly.
with open(fn, "r", newline="") as f:
    reader = csv.reader(f)
    header = next(reader)  # first row holds the column names
    data = {column: [] for column in header}
    for row in reader:
        # zip() stops at the shorter of header/row, so a short row only
        # contributes to the leading columns.
        for column, value in zip(header, row):
            data[column].append(value)
In [4]:
class Dataset:
    """A small column-oriented table.

    ``self.data`` maps a column name to that column's values. Columns are
    normally NumPy arrays (see :meth:`convert`); plain sequences are accepted
    too and are viewed through ``np.asarray`` wherever array behaviour is
    needed.
    """

    def __init__(self, data):
        """Wrap ``data``, a dict of column name -> values.

        Only the dict is copied (shallow copy); the column objects themselves
        are shared with the caller.
        """
        self.data = data.copy()

    def convert(self, column, dtype):
        """Replace ``column`` in place with a NumPy array of the given ``dtype``."""
        self.data[column] = np.array(self.data[column], dtype=dtype)

    def columns(self):
        """Return the column names (a view of the underlying dict's keys)."""
        return self.data.keys()

    def _select(self, mask):
        """Return a new Dataset with ``mask`` applied as an index to every column."""
        return Dataset(
            {name: np.asarray(values)[mask] for name, values in self.data.items()}
        )

    def filter_eq(self, column, value):
        """Return a new Dataset with the rows where ``column == value``."""
        return self._select(np.asarray(self.data[column]) == value)

    def filter_lt(self, column, value):
        """Return a new Dataset with the rows where ``column < value``."""
        return self._select(np.asarray(self.data[column]) < value)

    def filter_gt(self, column, value):
        """Return a new Dataset with the rows where ``column > value``."""
        return self._select(np.asarray(self.data[column]) > value)

    def filter_ne(self, column, value):
        """Return a new Dataset with the rows where ``column != value``."""
        return self._select(np.asarray(self.data[column]) != value)

    def size(self):
        """Return the number of entries in the first column, or 0 if there are no columns."""
        for values in self.data.values():
            return np.asarray(values).size
        return 0

    def split(self, column):
        """Partition by ``column``.

        Returns a dict mapping each distinct value of ``column`` to the
        Dataset of rows holding that value.
        """
        return {
            split_value: self.filter_eq(column, split_value)
            for split_value in np.unique(self.data[column])
        }

    def stats(self):
        """Summarise the numeric columns.

        Returns a dict mapping column name to ``(min, max, std, mean)``.
        Columns whose dtype is not an integer or floating type are skipped,
        as are empty columns (min/max are undefined for them).
        """
        statistics = {}
        for key in self.data:
            values = np.asarray(self.data[key])
            # issubdtype accepts every integer/float width, not just the
            # platform-default ones.
            is_numeric = (np.issubdtype(values.dtype, np.integer)
                          or np.issubdtype(values.dtype, np.floating))
            if not is_numeric or values.size == 0:
                continue
            statistics[key] = (values.min(), values.max(), values.std(), values.mean())
        return statistics

    def compare(self, other):
        """Print the statistics of ``self`` and ``other`` side by side.

        Only columns that have statistics in both datasets are printed; the
        four lines per column are min, max, std and mean, in that order.
        """
        stats1 = self.stats()
        stats2 = other.stats()
        for column in self.columns():
            if column not in stats1 or column not in stats2:
                continue
            print("Column '{0:25s}'".format(column))
            for s1, s2 in zip(stats1[column], stats2[column]):
                print(" {0} vs {1}".format(s1, s2))

    def plot(self, x_column, y_column):
        """Plot ``y_column`` against ``x_column`` as unconnected points on the current axes."""
        plt.plot(self.data[x_column], self.data[y_column], '.')
In [5]:
# Wrap the raw CSV columns in a Dataset and turn every column into a NumPy
# array: columns listed in `value_types` get that dtype, all others get "str".
value_types = {'Fiscal Year': 'int'}
trees = Dataset(data)
for column_name in trees.columns():
    target_dtype = value_types.get(column_name, "str")
    trees.convert(column_name, target_dtype)
In [24]:
# "Program Type" codes to tally, in the order they are plotted.
program_types = [
    "IA", "IC", "IG", "IL", "IM", "IS", "LE",
    "LG", "LI", "LL", "LT", "MA", "MG", "MH",
    "ML", "MN", "MP", "NC", "ND", "NE", "NG",
    "NL", "NO", "NP", "NR", "RE", "SP", "ST",
]

# One filtered Dataset per code, in the same order as `program_types`.
subsets = [trees.filter_eq("Program Type", code) for code in program_types]

# Keep the numbered names data1..data28 bound, one per code in list order.
(data1, data2, data3, data4, data5, data6, data7,
 data8, data9, data10, data11, data12, data13, data14,
 data15, data16, data17, data18, data19, data20, data21,
 data22, data23, data24, data25, data26, data27, data28) = subsets

# Row count for each program type. The length of a column is the number of
# rows; len(subset.data) would be the number of columns instead.
trunk = [len(subset.data["Program Type"]) for subset in subsets]

# The values are already counts, so draw them as bars (a histogram would bin
# the counts themselves). Numeric positions plus explicit tick labels keep
# the categorical x-axis working on any matplotlib version.
positions = np.arange(len(program_types))
plt.bar(positions, trunk)
plt.xticks(positions, program_types, rotation=90)
plt.grid()
plt.title("Number of records per program type")
plt.xlabel("Program Type")
plt.ylabel("Count")
In [23]:
# Rows whose "Program Type" is "IA" (data1) and "IC" (data2).
data1, data2 = [trees.filter_eq("Program Type", code) for code in ("IA", "IC")]
In [30]:
from collections import Counter
In [34]:
# Tally how often each value occurs in data1's "Program Type" column.
# Counter must be given the column values themselves: handing it the whole
# column dict would just store each column's array as that key's "count".
c = Counter(data1.data["Program Type"])
In [45]:
# Number of rows in data1: the length of any single column, here "Program Type".
program_type_column = data1.data["Program Type"]
len(program_type_column)
Out[45]:
In [48]:
# Row counts of data1 and data2, in that order.
trunk = [len(subset.data["Program Type"]) for subset in (data1, data2)]
In [49]:
# Display the current contents of `trunk`.
trunk
Out[49]:
In [51]:
# Display the type of `trunk`.
type(trunk)
Out[51]:
In [52]:
# Compare the first two counts in `trunk` as bars. The values are already
# counts, so a histogram would only show one sample in each of two bins.
# NOTE(review): the "IA"/"IC" tick labels assume trunk[0] and trunk[1] are the
# data1 and data2 row counts, as built in the preceding cell - confirm.
plt.bar([0, 1], trunk[:2])
plt.xticks([0, 1], ["IA", "IC"]);
Out[52]:
In [33]:
# Number of columns in data1 (entries in its column dict), not the number of rows.
len(data1.columns())
Out[33]:
In [19]:
# Two hard-coded counts.
# NOTE(review): presumably the "IA" and "IC" row counts from an earlier run -
# confirm, and prefer computing them from the data.
trunk = [1397, 1121]
In [22]:
# Draw the first two counts in `trunk` as bars. The values are already
# counts, so a histogram would only show one sample in each of two bins.
plt.bar([0, 1], trunk[:2])
plt.xticks([0, 1]);
Out[22]:
In [ ]: