In [1]:
# Load the San Francisco crime data
import pandas as pd
import numpy as np
from sklearn import linear_model
from matplotlib import pyplot as plt

In [2]:
import seaborn as sns

In [4]:
sfdata = pd.read_csv("../data/sf.csv")

In [5]:
len(sfdata.PdDistrict.unique())


Out[5]:
10

In [6]:
sfdata.PdDistrict.unique()


Out[6]:
array(['MISSION', 'PARK', 'INGLESIDE', 'SOUTHERN', 'TENDERLOIN',
       'NORTHERN', 'TARAVAL', 'RICHMOND', 'CENTRAL', 'BAYVIEW'], dtype=object)

In [7]:
# Add a per-row CrimeCount of 1 so that a groupby-sum yields incident counts.
# Note that `test` is just another reference to sfdata, so the column is added to sfdata as well.
test = sfdata
test['CrimeCount'] = np.ones(len(sfdata))

In [8]:
test = sfdata.groupby('PdDistrict')
aggregate = test.aggregate(np.sum)

In [11]:
sns.countplot(x="PdDistrict", data=sfdata)


Out[11]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5ef8808550>

In [12]:
plt.show()

In [14]:
# Convert date to actual date format. This might take a while!
sfdata.Date = sfdata['Date'].apply(lambda x: pd.to_datetime(x, errors='raise'))
sfdata.Time = sfdata['Time'].apply(lambda x: pd.to_datetime(x, errors='raise'))


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-14-bd0286b5678b> in <module>()
      1 # Convert date to actual date format. This might take a while!
----> 2 sfdata.Date = sfdata['Date'].apply(lambda x: pd.to_datetime(x))
      3 sfdata.Time = sfdata['Time'].apply(lambda x: pd.to_datetime(x))

[... remainder of the traceback through pandas/tslib and dateutil/parser elided; the per-row parse was interrupted by hand ...]
KeyboardInterrupt: 
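
The element-wise apply above calls the slow dateutil parser once per row, which is why the cell had to be interrupted. A vectorized call on the whole Series is much faster; the sketch below assumes the usual layout of this export (dates like "05/13/2014", times like "23:45"), so the format strings may need adjusting.

In [ ]:
# Vectorized alternative to the per-row apply above (a sketch; the format strings are assumptions).
# Passing the whole Series to pd.to_datetime lets pandas parse it in bulk.
sfdata['Date'] = pd.to_datetime(sfdata['Date'], format='%m/%d/%Y')
sfdata['Time'] = pd.to_datetime(sfdata['Time'], format='%H:%M')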

In [ ]:
def buckets(series, n):
    # Takes a series and returns a series mapping each element to one of
    # n buckets, whose edges are evenly spaced between the series min and max.
    mi, ma = series.min(), series.max()
    buckets = np.linspace(mi, ma, n)
    print buckets
    def f(e):
        for i, el in enumerate(buckets):
            if e < el:
                return i
        return n - 1
            
    res = series.map(f)
    return res

def cleanColumns(data):
    # Renames the columns in our data frame to their appropriate names
    # and drops the unnecessary ones.
    data['Latitude'] = data['Y']
    data['Longitude'] = data['X']
    data['Type'] = data['Category']
    
    print data.columns
    data = data.drop(['IncidntNum', 'Category', 'Descript', 'PdDistrict','Resolution','Address','X','Y', 'Location'], axis=1)
    
    return data

def createPartitions(data, n):
    # Remove outliers caused by bad latitude/longitude values.
    # We know that the city lies between -130 and -120 degrees longitude
    # and between 37 and 40 degrees latitude.
    data = data[(data.Longitude > -130) & (data.Longitude < -120)]
    data = data[(data.Latitude > 37) & (data.Latitude < 40)]
    
    # Each row is an occurrence of a single crime.
    # Keep around the original data
    data['Region'] = n * buckets(data['Latitude'], n) + buckets(data['Longitude'], n) + 1
    
    # Add in the types into the results.
    mapping = {key: i for i,key in enumerate(data['Type'].unique())}
    data['TypeIndex'] = data['Type'].map(mapping)

    # Now we can add the crime counts. 
    data['CrimeCount'] = np.ones(len(data))
    return data

def extractDateFeatures(data):
    # Creates a new data frame and returns it as copy with all the data that we're interested in
    # Create map from week days to integers
    DayOfWeek = {'Sunday': 1,
                 'Monday': 2,
                 'Tuesday': 3,
                 'Wednesday': 4,
                 'Thursday': 5,
                 'Friday': 6,
                 'Saturday': 7 }
    data['DoW'] = data['DayOfWeek'].map(DayOfWeek)
    data = data.drop('DayOfWeek', axis=1)
    print "Created Weeks"
    
    # We assume that the Date column is already in datetime format
    data['Month'] = data.Date.map(lambda x: x.month)
    data['DoM'] = data.Date.map(lambda x: x.day)
    data['Year'] = data.Date.map(lambda x: x.year) - data.Date.min().year
    data['ToD'] = data.Time.map(lambda x: 60 * x.hour + x.minute)
    data['Time'] = data.Time.map(lambda x: x.value / 10**9) - data.Date.min().value / 10**9
    
    # We add an additional column that combines the month and the year into number of months since beginning
    data['TimeFeature'] = data.ix[:, ['Year', 'Month']].apply(lambda s: 12*s[0] + s[1], axis=1)
    
    data = data.drop('Date', axis=1)
    
    print "Created the time data features!"
    
    return data

def extractDataFeatures(data, n):
    # Given the input data read directly from the exported data 
    # (https://data.sfgov.org/Public-Safety/Map-Crime-Incidents-from-1-Jan-2003/gxxq-x39z)
    # We convert it into the format specified as follows:
    # We want the results to be a data frame with the following columns.:
    # -> Latitude (float)
    # -> Longitude (float)
    # -> Region (int) : specifies which grid cell this data point belongs to
    # -> DoW (1-7) (Day of the Week)
    # -> Month (1-12) (Month of the Year)
    # -> DoM (1-31) (Day of the Month)
    # -> Year (0 -> max(year) - min(year))
    # -> ToD (Time of Day, specified as the number of minutes since the start of the day)
    # -> Time (int) : seconds since the earliest sample in the data set
    # -> Type (string) : describes the type of crime
    # -> TypeIndex (int) : index mapping each crime type uniquely to a value in [0 ... #crime types - 1]
    # -> CrimeCount (int) : 1 for every row; summing it within a group gives the crime count for that group
    
    # Remove unnecessary columns and rename
    cData = cleanColumns(data)
    
    # Create partitions. Note that this modifies the data by adding a Region column.
    partitionedData = createPartitions(cData, n)
    
    # Now we convert the data to the correct dates and clean the data!
    finalData = extractDateFeatures(partitionedData)   
    
    # Return the results
    return finalData
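
As a quick, hand-checkable sanity test of the bucketing logic, the cell below runs buckets() on a small synthetic series (not part of the pipeline):

In [ ]:
# Sanity check buckets() on synthetic data. With n = 4 the printed edges are
# linspace(0, 9, 4) = [0, 3, 6, 9]; each value maps to the index of the first
# edge it falls below, and anything >= the last edge maps to n - 1.
s = pd.Series([0.0, 2.5, 4.0, 8.0, 9.0])
print buckets(s, 4)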

In [ ]:
def countCrime(d, region):
    '''
    Counts the crime in a given region, returning an array of size 144
    (one entry per month over 12 years) with zeros hallucinated for time
    periods that have no recorded crime.
    '''
    res = np.zeros(144)
    for i in range(144):
        try:
            res[i] = d.ix[region, i].CrimeCount
        except (KeyError, AttributeError, IndexError):
            pass

    return res

In [ ]:
d = train.groupby(['Region', 'TimeFeature']).aggregate(np.sum)

In [ ]:
d.ix[1]

In [ ]:
def splitTrainTest(D, year=None):
    '''
    Given a data frame with the specified data, split it into a training set and a test set.
    The test set consists of a single held-out year (by default, the most recent year in the data).
    '''
    # We only want to keep some of the data
    D = D.ix[:, ['Year', 'TimeFeature', 'CrimeCount', 'Region']]
    
    # First, split off the held-out year (the most recent year unless one is specified).
    if year is None:
        test = D[D['Year'] == D['Year'].max()]
        train = D[D['Year'] != D['Year'].max()]
    else:
        test = D[D['Year'] == year]
        train = D[D['Year'] != year]
        
    # Drop the Year column; we keep Region, TimeFeature, and CrimeCount.
    train = train.drop('Year', axis=1)
    test = test.drop('Year', axis=1)

    print "Finished Creating Testing Set"
    
    # Now we group by Region and TimeFeature (months since the start of the data) and sum the crime counts.
    trainD, testD = train.groupby(['Region', 'TimeFeature']).aggregate(np.sum), test.groupby([ 'Region', 'TimeFeature']).aggregate(np.sum)
    # return trainD
    trainRes = {}
    testRes = {}
    for region in range(1,D.Region.max()+1):
        # Training
        trainRes[region] = np.zeros((144,2))
        trainRes[region][:,0] = range(144)
        # print trainRes[region]
        trainRes[region][:,1] = countCrime(trainD, region)
        
        # Test 
        testRes[region] = np.zeros((144,2))
        testRes[region][:,0] = range(144) 
        testRes[region][:,1] = countCrime(testD, region)
    
    return trainRes, testRes

In [ ]:
results = extractDataFeatures(sfdata, 10)

In [ ]:
def plotHistogram(results, n):
    fig, ax = plt.subplots( nrows=1, ncols=1 )
    ax.hist(results.Region, bins=range(n*n))
    plt.xlabel("San Francisco Region")
    plt.ylabel("Total Number of Reported Incidents")
    plt.title("Crime Incidents in San Francisco when Divided into {}x{} grid.".format(n,n))
    plt.savefig("figures/sf_n{}".format(n))
    plt.close(fig)
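
plt.savefig above will fail if the figures/ directory does not exist, so it may be worth creating it first (a small guard, assuming the same relative path):

In [ ]:
# Make sure the output directory used by plotHistogram exists.
import os
if not os.path.isdir("figures"):
    os.makedirs("figures")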

In [ ]:
plotHistogram(results, 10)

In [ ]:
train, test = splitTrainTest(results)

In [ ]:
def getRMSE(x,y):
    return np.sqrt(np.sum((y - x)**2) / len(y))

def trainAndTest(train, test):
    # Given a set of training and testing data, trains a linear regression model per region, tests it,
    # and computes the RMSE of each region, returning the resulting dictionary of RMSEs per region, as well
    # as a dictionary of predictions and a dictionary of trained models.
    models = {}
    RMSE = {}
    predictions = {}
    for region in train:
        if region % 10 == 0:
            print "Training on region {}".format(region)
        model = linear_model.LinearRegression()
        x = train[region][:,0]
        x = x.reshape((len(x),1))
        y = train[region][:,1]
        y = y.reshape((len(y),1))
        model.fit(x,y)
        xTest = test[region][:,0]
        xTest = xTest.reshape((len(xTest),1))
        yTest = test[region][:,1]
        yTest = yTest.reshape((len(yTest),1))
        preds = model.predict(xTest)
        rmse = getRMSE(yTest, preds)
        
        # store the results
        models[region] = model
        RMSE[region] = rmse
        predictions[region] = preds
    
    return models, RMSE, predictions
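
A tiny sanity check of getRMSE on hand-computable synthetic values (not from the data set):

In [ ]:
# Predictions that are off by exactly 2 everywhere should give an RMSE of 2.
y_true = np.array([1.0, 2.0, 3.0])
y_pred = y_true + 2.0
print getRMSE(y_true, y_pred)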

In [ ]:
models, RMSE, predictions = trainAndTest(train, test)

In [ ]:
sum(RMSE.values()) / len(RMSE)

In [ ]:
tRes = createPartitions(results, 1)

In [ ]:
tRes

In [ ]:
test

In [ ]:
# We try different values of n and calculate the average RMSE for each value!
# Note that results already has the data extracted
rmses = []
max_rmses = []
min_rmses = []
for n in range(1,10) + range(10,50,10):
    # We only need to extract the data once, which results has already done (for n = 10)
    tRes = createPartitions(results, n)
    # This saves a plot of the distribution as a histogram
    plotHistogram(tRes, n)
    
    # Now we split into train and test
    train, test = splitTrainTest(tRes)
    
    # Now we calculate the average RMSE
    _, RMSE, _ = trainAndTest(train, test)
    print "RMSE: {}".format(sum(RMSE.values()) / len(RMSE))
    rmses.append(sum(RMSE.values()) / len(RMSE))
    max_rmses.append(max(RMSE.values()))
    min_rmses.append(min(RMSE.values()))

In [ ]:
min_rmses

In [ ]:
x = range(1,10) + range(10,50,10)
plt.scatter(x, rmses, color='red')
plt.scatter(x, max_rmses, color='blue')
plt.title("RMSE vs Resolution")
plt.xlabel("Geographic Resolution (n)")
plt.ylabel("Average and Max RMSE")
plt.show()

In [ ]:
zip(x,rmses)

In [ ]: