In [1]:
%matplotlib inline

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import csv

In [3]:
fn="/Users/qiqi/Desktop/Administrative_Discretionary_Grants 1996-2014_.csv"
with open(fn, "r") as f:
    reader = csv.reader(f)
    header = next(reader)
    data = {}
    for column in header:
        data[column] = []
    for row in reader:
        for column, value in zip(header, row):
            data[column].append(value)

In [4]:
class Dataset:
    def __init__(self, data):
        self.data = data.copy()
        
    def convert(self, column, dtype):
        self.data[column] = np.array(self.data[column], dtype=dtype)
        
    def columns(self):
        return self.data.keys()
    
    def filter_eq(self, column, value):
        good = (self.data[column] == value)
        new_data = {}
        for column in self.data:
            new_data[column] = self.data[column][good]
        return Dataset(new_data)
    
    def filter_lt(self, column, value):
        good = (self.data[column] < value)
        new_data = {}
        for column in self.data:
            new_data[column] = self.data[column][good]
        return Dataset(new_data)
    
    def filter_gt(self, column, value):
        good = (self.data[column] > value)
        new_data = {}
        for column in self.data:
            new_data[column] = self.data[column][good]
        return Dataset(new_data)
    
    def filter_ne(self, column, value):
        good = (self.data[column] != value)
        new_data = {}
        for column in self.data:
            new_data[column] = self.data[column][good]
        return Dataset(new_data)
    
    def size(self):
        for key in self.data:
            return self.data[key].size

    def split(self, column):
        new_datasets = {}
        for split_value in np.unique(self.data[column]):
            new_datasets[split_value] = self.filter_eq(column, split_value)
        return new_datasets

    def stats(self):
        statistics = {}
        for key in self.data:
            if self.data[key].dtype not in ("float", "int"):
                continue
            values = self.data[key]
            statistics[key] = (values.min(), values.max(), values.std(), values.mean())
        return statistics
    
    def compare(self, other):
        stats1 = self.stats()
        stats2 = other.stats()
        for column in self.columns():
            if column not in stats1: continue
            print("Column '{0:25s}'".format(column))
            for s1, s2 in zip(stats1[column], stats2[column]):
                print("    {0} vs {1}".format(s1, s2))
    
    def plot(self, x_column, y_column):
        plt.plot(self.data[x_column], self.data[y_column], '.')

In [5]:
trees = Dataset(data)
value_types = {'Fiscal Year': 'int'}
for v in trees.columns():
    trees.convert(v, value_types.get(v, "str"))

In [24]:
data1=trees.filter_eq("Program Type","IA")
data2=trees.filter_eq("Program Type","IC")
data3=trees.filter_eq("Program Type","IG")
data4=trees.filter_eq("Program Type","IL")
data5=trees.filter_eq("Program Type","IM")
data6=trees.filter_eq("Program Type","IS")
data7=trees.filter_eq("Program Type","LE")
data8=trees.filter_eq("Program Type","LG")
data9=trees.filter_eq("Program Type","LI")
data10=trees.filter_eq("Program Type","LL")
data11=trees.filter_eq("Program Type","LT")
data12=trees.filter_eq("Program Type","MA")
data13=trees.filter_eq("Program Type","MG")
data14=trees.filter_eq("Program Type","MH")
data15=trees.filter_eq("Program Type","ML")
data16=trees.filter_eq("Program Type","MN")
data17=trees.filter_eq("Program Type","MP")
data18=trees.filter_eq("Program Type","NC")
data19=trees.filter_eq("Program Type","ND")
data20=trees.filter_eq("Program Type","NE")
data21=trees.filter_eq("Program Type","NG")
data22=trees.filter_eq("Program Type","NL")
data23=trees.filter_eq("Program Type","NO")
data24=trees.filter_eq("Program Type","NP")
data25=trees.filter_eq("Program Type","NR")
data26=trees.filter_eq("Program Type","RE")
data27=trees.filter_eq("Program Type","SP")
data28=trees.filter_eq("Program Type","ST")


trunk=[]
for i in range(1,29):
    for item in trunk:
        trunk.append(len(datai.data))
    
plt.hist([trunk[0],trunk[1]])
#plt.hist(data18,data19,data20,data21,data22,data23,data24,data25,data26,data27,data28)
plt.legend()
plt.grid()
plt.title("Distribution of tree trunk for five most common species")
plt.xlabel("Number of Trunks")
plt.ylabel("Count")


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-24-f2633466198f> in <module>()
     58 trunk.append(data28)
     59 
---> 60 plt.hist([trunk[0],trunk[1]])
     61 #plt.hist(data18,data19,data20,data21,data22,data23,data24,data25,data26,data27,data28)
     62 plt.legend()

/Users/qiqi/anaconda/lib/python2.7/site-packages/matplotlib/pyplot.pyc in hist(x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, hold, data, **kwargs)
   3080                       histtype=histtype, align=align, orientation=orientation,
   3081                       rwidth=rwidth, log=log, color=color, label=label,
-> 3082                       stacked=stacked, data=data, **kwargs)
   3083     finally:
   3084         ax._hold = washold

/Users/qiqi/anaconda/lib/python2.7/site-packages/matplotlib/__init__.pyc in inner(ax, *args, **kwargs)
   1890                     warnings.warn(msg % (label_namer, func.__name__),
   1891                                   RuntimeWarning, stacklevel=2)
-> 1892             return func(ax, *args, **kwargs)
   1893         pre_doc = inner.__doc__
   1894         if pre_doc is None:

/Users/qiqi/anaconda/lib/python2.7/site-packages/matplotlib/axes/_axes.pyc in hist(self, x, bins, range, normed, weights, cumulative, bottom, histtype, align, orientation, rwidth, log, color, label, stacked, **kwargs)
   6190             # this will automatically overwrite bins,
   6191             # so that each histogram uses the same bins
-> 6192             m, bins = np.histogram(x[i], bins, weights=w[i], **hist_kwargs)
   6193             m = m.astype(float)  # causes problems later if it's an int
   6194             if mlast is None:

/Users/qiqi/anaconda/lib/python2.7/site-packages/numpy/lib/function_base.pyc in histogram(a, bins, range, normed, weights, density)
    497             mn, mx = a.min() + 0.0, a.max() + 0.0
    498     else:
--> 499         mn, mx = [mi + 0.0 for mi in range]
    500     if mn > mx:
    501         raise ValueError(

TypeError: unsupported operand type(s) for +: 'instance' and 'float'

In [23]:
data1=trees.filter_eq("Program Type","IA")
data2=trees.filter_eq("Program Type","IC")

In [30]:
from collections import Counter

In [34]:
c = Counter(data1.data)

In [45]:
len(data1.data["Program Type"])


Out[45]:
1397

In [48]:
trunk=[]
trunk.append(len(data1.data["Program Type"]) )
trunk.append(len(data2.data["Program Type"]) )

In [49]:
trunk


Out[49]:
[1397, 1121]

In [51]:
type(trunk)


Out[51]:
list

In [52]:
plt.hist([trunk[0],trunk[1]])


Out[52]:
(array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]),
 array([ 1121. ,  1148.6,  1176.2,  1203.8,  1231.4,  1259. ,  1286.6,
         1314.2,  1341.8,  1369.4,  1397. ]),
 <a list of 10 Patch objects>)

In [33]:
len(data1.data)


Out[33]:
36

In [19]:
trunk=[]
trunk.append(1397)
trunk.append(1121)
trunk

In [22]:
plt.hist([trunk[0],trunk[1]])


Out[22]:
(array([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]),
 array([ 1121. ,  1148.6,  1176.2,  1203.8,  1231.4,  1259. ,  1286.6,
         1314.2,  1341.8,  1369.4,  1397. ]),
 <a list of 10 Patch objects>)

In [ ]: