San Francisco Crime Classification :

A Kaggle Competition


In [4]:
# loading the data set

# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas
import csv
import os
from scipy.sparse import *
from scipy import *

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import *

In [80]:
os.chdir('/Users/Vamsi/Downloads/')
with open('train.csv', 'rb') as f0:
    reader_rowcount = csv.reader(f0)
    row_count = sum(1 for row in reader_rowcount)
    
with open('train.csv', 'rb') as f1:    
    reader = csv.reader(f1)
    data_list = []
    for row in reader:
        data_list.append(row)
    length = len(data_list[0])
    data_np = np.empty((row_count,length),dtype='S256')  
    for i in range(len(data_list)):
        for j in range(length):
            data_np[i,j]  = data_list[i][j]
    train_data_pre = np.delete(data_np,1,1)
    train_data = train_data_pre[1:,:]
    train_labels  = data_np[1:,1]
      
    train_data_df = pandas.DataFrame(data=train_data,columns=train_data_pre[0,:])
    
    train_data_df_dict = train_data_df.T.to_dict().values()
    dv = DictVectorizer()
    sp_matrix = (dv.fit_transform(train_data_df_dict))

    
with open('test.csv', 'rb') as ftest0:
    reader_rowcount = csv.reader(ftest0)
    row_count = sum(1 for row in reader_rowcount)
    
with open('test.csv', 'rb') as ftest:    
    reader = csv.reader(ftest)
    data_list = []
    for row in reader:
        data_list.append(row)
    length = len(data_list[0])
    data_np = np.empty((row_count,length),dtype='S256')  
    for i in range(len(data_list)):
        for j in range(length):
            data_np[i,j]  = data_list[i][j]
    test_data_pre = np.delete(data_np,0,1)
    test_data = test_data_pre[1:,:]
      
    test_data_df = pandas.DataFrame(data=test_data,columns=test_data_pre[0,:])
    test_data_df_dict = test_data_df.T.to_dict().values()
    test_matrix = (dv.transform(test_data_df_dict))
    
    le_train = LabelEncoder()
    labels = le_train.fit_transform(train_labels)

#    dev_data_df = pandas.DataFrame(data=train_data[5000:10000],columns=train_data_pre[0,:])
#    
#    dev_data_df_dict = dev_data_df.T.to_dict().values()
    
#    dev_matrix = (dv.transform(dev_data_df_dict))
         
    logR = LogisticRegression()
    logR.fit(sp_matrix,labels)
    
    pred = logR.predict(test_matrix)
    print pred
#    score = logR.score(test_matrix,labels[5000:10000])
#    print score

    
    prob = logR.predict_proba(test_matrix)
    header_pre = np.unique(train_labels)
    header_pre1 = np.insert(header_pre,0,'Id')
    header  = np.reshape(header_pre1,(header_pre1.shape[0]))
    
    ID_pre = np.arange(start=0,stop=prob.shape[0],step=1)
    ID = np.reshape(ID_pre,(prob.shape[0],1))
    
    np.set_printoptions(precision=3,suppress=True)
    prob_real = np.concatenate((ID,prob),axis=1)
    print prob_real
    print prob_real.shape

    header_dump = ','.join('%s' %header[string] for string in xrange(header.shape[0]))
    
    a = np.asarray(prob_real)
    print a
    np.savetxt("foo.csv", a, delimiter=",",header=header_dump,fmt=['%d','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f','%.2f'])


[21 21 21 ..., 21 21 21]
[[      0.          0.006       0.084 ...,       0.012       0.009
        0.052]
 [      1.          0.005       0.07  ...,       0.01        0.008
        0.048]
 [      2.          0.004       0.063 ...,       0.013       0.006
        0.032]
 ..., 
 [ 884259.          0.004       0.062 ...,       0.01        0.006
        0.035]
 [ 884260.          0.006       0.073 ...,       0.009       0.01
        0.06 ]
 [ 884261.          0.006       0.052 ...,       0.011       0.013
        0.027]]
(884262, 40)
[[      0.          0.006       0.084 ...,       0.012       0.009
        0.052]
 [      1.          0.005       0.07  ...,       0.01        0.008
        0.048]
 [      2.          0.004       0.063 ...,       0.013       0.006
        0.032]
 ..., 
 [ 884259.          0.004       0.062 ...,       0.01        0.006
        0.035]
 [ 884260.          0.006       0.073 ...,       0.009       0.01
        0.06 ]
 [ 884261.          0.006       0.052 ...,       0.011       0.013
        0.027]]

In [ ]: