In [8]:
import pandas as pd
import csv
import sklearn.linear_model as lm
import matplotlib.pyplot as plt
import pickle
import numpy as np
%matplotlib inline

In [9]:
'''
Utility functions
'''
# shuffle rows in place, then split at the given fraction
def split(X, split_size):
    np.random.shuffle(X)
    break_pt = int(split_size * np.shape(X)[0])  # slice indices must be integers
    return X[:break_pt,:], X[break_pt:,:]

# impute NaNs with the column mean (in place), then standardize each column
def normalize_features(X_train):
    mean_X_train = np.nanmean(X_train, 0)
    for i in xrange(np.shape(X_train)[1]):
        col = X_train[:,i]
        col[ np.isnan(col) ] = mean_X_train[i]  # note: mutates X_train
    std_X_train = np.std(X_train, 0)
    std_X_train[ std_X_train == 0 ] = 1         # guard against division by zero
    X_train_normalized = (X_train - mean_X_train) / std_X_train
    return X_train_normalized

# Note: returns a copy of X with the given columns replaced by bucket indices 0..num_buckets-1
def bucket(X, cols, num_buckets):
    Y = np.copy(X)
    for col in cols:
        edges = np.linspace(np.min(X[:,col]), np.max(X[:,col]), num=num_buckets+1)
        for i in xrange(num_buckets):
            # mask against the original column so values already assigned a
            # bucket index are never re-bucketed on a later iteration
            mask = (edges[i] <= X[:,col]) & (X[:,col] <= edges[i+1])
            Y[mask, col] = i
    return Y

def rmse(predict, true):
    return np.sqrt(1.0/np.shape(predict)[0] * np.sum(np.square(predict - true)))
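
In [ ]:
'''
Quick sanity check of the utilities above -- a hedged sketch on synthetic
data; the shapes, seed, and bucket count are illustrative assumptions, not
part of the original analysis.
'''
np.random.seed(0)                   # assumed seed, for reproducibility only
Z = np.random.randn(10, 3)
Z[0, 1] = np.nan                    # one missing value to exercise imputation
Z = normalize_features(Z)           # NaN -> column mean, then standardize
train, test = split(Z, 0.8)         # 8 training rows, 2 test rows
Zb = bucket(Z, [2], 4)              # last column replaced by bucket indices 0..3
print(rmse(np.zeros(10), Z[:, 0]))  # RMSE of a zero predictor on column 0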

In [10]:
'''
Load in data
'''
import re
import warnings

years = np.arange(2001, 2016)  # 2001 through 2015, one CSV per year
buckets = None                 # aggregated counts, accumulated below
target_type = str              # dtype to force onto mixed-type columns

for year in years:

    data_file = '../data/Crimes_-_%d.csv' % year

    with warnings.catch_warnings(record=True) as ws:
        warnings.simplefilter("always")

        data = pd.read_csv(data_file, sep=",", header=0)
        print("Warnings raised:", ws)
        # DtypeWarning names the mixed-type columns; recast those as strings
        for w in ws:
            s = str(w.message)
            print("Warning message:", s)
            match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
            if match:
                columns = match.group(1).split(',') # Get columns as a list
                columns = [int(c) for c in columns]
                print("Applying %s dtype to columns:" % target_type, columns)
                data.iloc[:,columns] = data.iloc[:,columns].astype(target_type)

    '''
    Extract Relevant Features
    '''
    # temporal features: date, clock time, AM/PM marker
    date_time = np.array([x.split() for x in data.Date])
    date = date_time[:,0]
    time = date_time[:,1]
    tod = date_time[:,2]

    # month, day of month, year
    date = np.array([x.split('/') for x in date])
    month = [int(x) for x in date[:,0]]
    dom = [int(x) for x in date[:,1]]
    yrs = [int(x) for x in date[:,2]]  # renamed: `years` would clobber the loop iterable
    time_feat = np.subtract(yrs, 2001)*12 + month

    # time of day, in minutes after midnight (12 AM is hour 0, 12 PM is hour 12)
    def to_minutes(hm, ampm):
        hour, minute = int(hm[0]), int(hm[1])
        if hour == 12:
            hour = 0
        if ampm == 'PM':
            hour += 12
        return 60*hour + minute

    time_c = [x.split(':') for x in time]
    time = [to_minutes(x, y) for x, y in zip(time_c, tod)]

    feats = np.transpose(np.vstack((time_feat, data.Latitude, data.Longitude))).astype(float)

    # drop rows with a missing Latitude
    feats = feats[~np.isnan(feats[:,1])]

    # bucket the data
    n_buckets = 5
    data_b = bucket(feats, [1, 2], n_buckets)
    
    n_time = 12  # months per year

    # rows of (month, lat bucket, lon bucket, count)
    buckets2 = np.zeros((n_buckets * n_buckets * n_time, 4))

    # count the crimes per geographic cell per month
    for i in xrange(n_time):
        for j in xrange(n_buckets):
            for k in xrange(n_buckets):

                # note: one-indexing for months
                mo = i + 1 + (year - 2001)*12
                count = data_b[ (data_b[:,0] == mo) &
                                (data_b[:,1] == j) &
                                (data_b[:,2] == k) ]
                row = i*n_buckets*n_buckets + j*n_buckets + k
                buckets2[row] = [mo, j, k, np.size(count, 0)]
    
    if buckets is None:
        buckets = buckets2
    else:
        buckets = np.vstack((buckets, buckets2))
    
    print "finished", year


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-10-68a6ce0fef90> in <module>()
     16         warnings.simplefilter("always")
     17 
---> 18         data = pd.read_csv(data_file, sep=",", header=0)
     19         print("Warnings raised:", ws)
     20         # We have an error on specific columns, try and load them as string

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
    496                     skip_blank_lines=skip_blank_lines)
    497 
--> 498         return _read(filepath_or_buffer, kwds)
    499 
    500     parser_f.__name__ = name

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    273 
    274     # Create the parser.
--> 275     parser = TextFileReader(filepath_or_buffer, **kwds)
    276 
    277     if (nrows is not None) and (chunksize is not None):

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    588             self.options['has_index_names'] = kwds['has_index_names']
    589 
--> 590         self._make_engine(self.engine)
    591 
    592     def _get_options_with_defaults(self, engine):

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in _make_engine(self, engine)
    729     def _make_engine(self, engine='c'):
    730         if engine == 'c':
--> 731             self._engine = CParserWrapper(self.f, **self.options)
    732         else:
    733             if engine == 'python':

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds)
   1101         kwds['allow_leading_cols'] = self.index_col is not False
   1102 
-> 1103         self._reader = _parser.TextReader(src, **kwds)
   1104 
   1105         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3246)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:6111)()

IOError: File ../data/Crimes_-_2001.csv does not exist
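
In [ ]:
'''
The raw CSVs are missing in this environment, so the aggregated counts were
presumably pickled once and reloaded below. A minimal sketch of that save
step -- the output path is an assumption, mirrored from the load cell that
follows; this cell is not part of the original notebook.
'''
# assumed path: the same file that the next cell tries to read back
with open("../../cs281_data/large_data/chicago.pkl", "wb") as f:
    pickle.dump(buckets, f)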

In [13]:
with open("../../cs281_data/large_data/chicago.pkl", "rb") as f:  # binary mode for pickle
    x = pickle.load(f)


---------------------------------------------------------------------------
EOFError                                  Traceback (most recent call last)
<ipython-input-13-fec2b0e8155b> in <module>()
      1 with open("../../cs281_data/large_data/chicago.pkl", "rb") as f:  # binary mode for pickle
----> 2     x = pickle.load(f)

/usr/lib/python2.7/pickle.pyc in load(file)
   1376 
   1377 def load(file):
-> 1378     return Unpickler(file).load()
   1379 
   1380 def loads(str):

/usr/lib/python2.7/pickle.pyc in load(self)
    856             while 1:
    857                 key = read(1)
--> 858                 dispatch[key](self)
    859         except _Stop, stopinst:
    860             return stopinst.value

/usr/lib/python2.7/pickle.pyc in load_eof(self)
    878 
    879     def load_eof(self):
--> 880         raise EOFError
    881     dispatch[''] = load_eof
    882 

EOFError: 
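
In [ ]:
'''
An EOFError from pickle.load almost always means the file is empty or
truncated. A quick diagnostic -- a hedged sketch; the path is the one used
in the failing cell above.
'''
import os
# a size of 0 bytes would confirm an empty (never-written) pickle file
print(os.path.getsize("../../cs281_data/large_data/chicago.pkl"))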
