In [1]:
#!/usr/bin/env python
%matplotlib inline
import csv
import re
import operator
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from gensim.models import word2vec
from sknn.mlp import Classifier, Layer

# project-local helpers
from dataset import *
from names_to_arrays import *

In [2]:
# Embed street names as 10-D vectors with word2vec.
d = data("../../data/List_of_Streets_and_Intersections.csv",
         categorical=["streetname", "from_st", "to_st"])
params_model = {"window": 5, "min_count": 1, "workers": 5}
new_data = []
for i in d.df.index:
    a = d.df.loc[i]
    # review_to_words strips characters matching [^0-9A-Z] and drops the
    # abbreviation stopwords, returning a token list per row.
    new_data.append(review_to_words(" ".join(map(str, list(a))),
                                    keepcaracters="[^0-9A-Z]",
                                    stops=ABREVIATIONS))
model = word2vec.Word2Vec(new_data, size=10, **params_model)


WARNING:gensim.models.word2vec:consider setting layer size to a multiple of 4 for greater performance
['CNN' 'streetname' 'from_st' 'to_st' 'cardinal' 'addrange' 'limits'
 'location' 'theOrder' 'LF_FADD' 'RT_FADD' 'LF_TADD' 'RT_TADD' 'FROM_CNN'
 'TO_CNN']
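
As a quick sanity check on the learned street embeddings, the trained model can be queried for a token's vector and its nearest neighbours. A minimal sketch, assuming the pre-1.0 gensim API used above; the token "MARKET" is a placeholder that may or may not be in the vocabulary:

In [ ]:
# Sketch: inspect the learned 10-D embeddings ("MARKET" is an assumed token;
# substitute any token present in model.vocab).
token = "MARKET"
if token in model.vocab:
    print model[token]               # the 10-D vector for this token
    print model.most_similar(token)  # nearest tokens by cosine similarity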

In [2]:
# Earlier variant, kept for reference: the listed columns treated as categorical.
"""
training_set = data("../../data/train.csv",
                    categorical=["PdDistrict", "Resolution", "Address", "Dates", "DayOfWeek"],
                    measure=["X", "Y"],
                    hidden_cluster=["Category"],
                    header=0,
                    index=None)
"""
# Current variant: only the X/Y coordinates as features, Category as the hidden label.
training_set = data("../../data/train.csv",
                    categorical=[],
                    measure=["X", "Y"],
                    hidden_cluster=["Category"],
                    header=0,
                    index=None)
training_set.standardize_table()
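
For reference, standardizing the coordinate columns amounts to z-scoring each one. A minimal equivalent in plain pandas, assuming standardize_table subtracts the column mean and divides by the standard deviation (the dataset module's implementation may differ):

In [ ]:
# Sketch of what standardize_table presumably does to the measure columns
# (assumption: per-column z-scoring; check dataset.py for the exact rule).
raw = pd.read_csv("../../data/train.csv")
coords = raw[["X", "Y"]]
z_scored = (coords - coords.mean()) / coords.std()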

In [3]:
# Map the Category strings to integer class labels in place.
training_set.convert_cluster_membership_to_numerical(["Category"])
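
The same label encoding can be reproduced with scikit-learn, which is handy when the mapping needs to be inverted later. A sketch, not the dataset module's own implementation:

In [ ]:
# Sketch: an equivalent category -> integer mapping with scikit-learn,
# applied to the raw CSV since the column above was converted in place.
from sklearn.preprocessing import LabelEncoder
raw = pd.read_csv("../../data/train.csv")
le = LabelEncoder()
labels = le.fit_transform(raw["Category"])
# le.inverse_transform(labels) recovers the original category strings.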

In [62]:
X = training_set.df[training_set.features].values
Y = training_set.df[training_set.hidden_cluster].values
# Keep only ~1,000 rows for quick prototyping (note: the slice starts at 1,
# so row 0 is silently dropped; [:1000] is probably what was intended).
X = X[1:1000]
Y = Y[1:1000]
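
Slicing off the first rows gives a single training set with no held-out data; a proper split keeps the evaluation below honest. A sketch with scikit-learn (the old cross_validation module, matching the Python 2 era of this notebook):

In [ ]:
# Sketch: hold out a test set instead of evaluating on training rows.
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)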

In [73]:
output_dimensionality = len(np.unique(Y))
"""
nn = Classifier(
    layers=[
        Layer("Linear", units=2),
        Layer("Tanh",units=output_dimensionality  ),
        Layer("Softmax",units= output_dimensionality)],
    learning_rate=0.02,
    n_iter=10)
"""
from sknn.platform import gpu32
nn = Classifier(
    layers=[
        Layer("Maxout", units=2,pieces = 2),
        Layer("Softmax",units= output_dimensionality)],
    learning_rate=0.001,
    n_iter=25)
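
Each Maxout unit with pieces=2 outputs the elementwise maximum of two learned linear projections, which lets the layer learn a convex activation shape instead of fixing one in advance. A numpy sketch of the forward pass for such a layer (the weights are random placeholders, not the fitted network's parameters):

In [ ]:
# Sketch: forward pass of a maxout layer with 2 units and pieces=2.
rng = np.random.RandomState(0)
x = rng.randn(5, 2)                    # 5 samples, 2 input features (X, Y)
W = rng.randn(2, 2, 2)                 # (pieces, inputs, units)
b = rng.randn(2, 2)                    # (pieces, units)
z_pieces = np.array([x.dot(W[p]) + b[p] for p in range(2)])  # (pieces, samples, units)
out = z_pieces.max(axis=0)             # elementwise max over the linear pieces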

In [74]:
# Train the network; fit() returns the classifier itself.
z = nn.fit(X, Y)
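
Since sknn's Classifier advertises scikit-learn estimator compatibility, the usual model-selection tooling should apply in principle. A sketch with 3-fold cross-validation, untested against this exact sknn version, so treat the compatibility as an assumption:

In [ ]:
# Sketch: 3-fold cross-validation, assuming sknn supports estimator cloning.
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(nn, X, np.ravel(Y), cv=3)
print scores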

In [75]:
# Evaluate on the first ~10,000 rows. The 999 training rows are included,
# so this understates the true generalization error.
h = z.predict(training_set.df[training_set.features].values[1:10000])
y = training_set.df[training_set.hidden_cluster].values[1:10000]
y = np.transpose(y)[0]
# Count misclassified rows.
s = 0
for i, hh in enumerate(h):
    if hh != y[i]:
        s += 1
print s


7298
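
The raw count is easier to read as a rate: 7298 errors over 9,999 evaluated rows is roughly a 73% misclassification rate. Equivalently, with scikit-learn metrics (h and y from the cell above):

In [ ]:
# Sketch: the same evaluation via scikit-learn (h, y from the cell above).
from sklearn.metrics import accuracy_score
print "error rate: %.3f" % (1.0 - accuracy_score(y, np.ravel(h)))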
