In [4]:
# loading the data set
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline
# General libraries.
import re
import numpy as np
import matplotlib.pyplot as plt
import pandas
import csv
import os
from scipy.sparse import *
from scipy import *
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report
# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups
# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import *
In [80]:
# ---------------------------------------------------------------------------
# Fit a logistic-regression classifier on train.csv and write per-class
# probabilities for test.csv to foo.csv (Kaggle-style submission).
#
# Fixes vs. the original cell:
#   * Python 3 syntax: print() / range instead of Python 2 `print x` / xrange,
#     and text-mode open(newline='') for the csv module.
#   * The duplicated CSV-reading code for train and test is one helper.
#   * The redundant first pass over each file that only counted rows is gone;
#     len() of the parsed rows is the row count.
#   * np.savetxt is called with comments='' so the header line is written as
#     "Id,..." rather than "# Id,..." (savetxt prefixes header text with '# '
#     by default, which would corrupt the submission header row).
#   * The per-column format list is built as ['%d'] + ['%.2f'] * n_classes
#     instead of a hand-typed list of forty '%.2f' strings.
#   * A single DATA_DIR constant replaces os.chdir to an absolute personal
#     path; dead commented-out dev-set code and the no-op reshape/asarray
#     calls are removed.
# ---------------------------------------------------------------------------

DATA_DIR = '/Users/Vamsi/Downloads/'  # TODO: point at your local copy of the data


def load_csv_as_array(path):
    """Read a CSV file into a 2-D numpy byte-string array.

    Parameters
    ----------
    path : str
        Path to the CSV file.

    Returns
    -------
    numpy.ndarray
        The full table, header row included, with dtype 'S256'.
        NOTE: cells longer than 256 bytes are silently truncated
        (same behaviour as the original code).
    """
    with open(path, newline='') as fh:
        rows = list(csv.reader(fh))
    table = np.empty((len(rows), len(rows[0])), dtype='S256')
    for i, row in enumerate(rows):
        for j, value in enumerate(row):
            table[i, j] = value
    return table


# ---- training data --------------------------------------------------------
train_table = load_csv_as_array(os.path.join(DATA_DIR, 'train.csv'))
# Column 1 holds the target label; remove it from the feature columns.
train_features_table = np.delete(train_table, 1, axis=1)
train_data = train_features_table[1:, :]       # strip header row
train_labels = train_table[1:, 1]

train_data_df = pandas.DataFrame(data=train_data,
                                 columns=train_features_table[0, :])
# One dict per row; equivalent to the old `df.T.to_dict().values()` idiom.
train_records = train_data_df.to_dict(orient='records')
dv = DictVectorizer()
sp_matrix = dv.fit_transform(train_records)

# ---- test data -------------------------------------------------------------
test_table = load_csv_as_array(os.path.join(DATA_DIR, 'test.csv'))
# Column 0 is the row id; remove it from the feature columns.
test_features_table = np.delete(test_table, 0, axis=1)
test_data = test_features_table[1:, :]

test_data_df = pandas.DataFrame(data=test_data,
                                columns=test_features_table[0, :])
test_records = test_data_df.to_dict(orient='records')
# transform (NOT fit_transform): reuse the feature vocabulary learned on train.
test_matrix = dv.transform(test_records)

# ---- model -----------------------------------------------------------------
le_train = LabelEncoder()
labels = le_train.fit_transform(train_labels)

logR = LogisticRegression()
logR.fit(sp_matrix, labels)
pred = logR.predict(test_matrix)
print(pred)

prob = logR.predict_proba(test_matrix)  # shape: (n_test_rows, n_classes)

# ---- submission file -------------------------------------------------------
# Header: "Id" followed by the class names in the order predict_proba uses
# (np.unique sorts, matching LabelEncoder's class ordering).
class_names = np.unique(train_labels)
header_dump = ','.join(
    ['Id'] + [name.decode() if isinstance(name, bytes) else str(name)
              for name in class_names])

# Row ids 0..n-1 as the first column.
# NOTE(review): the original also generated ids with arange; if test.csv has
# its own id column, that should be used instead — confirm against the data.
ids = np.arange(prob.shape[0]).reshape(-1, 1)
np.set_printoptions(precision=3, suppress=True)
prob_real = np.concatenate((ids, prob), axis=1)
print(prob_real)
print(prob_real.shape)

# '%d' for the Id column, one '%.2f' per class-probability column.
fmt = ['%d'] + ['%.2f'] * prob.shape[1]
np.savetxt(os.path.join(DATA_DIR, 'foo.csv'), prob_real, delimiter=',',
           header=header_dump, comments='', fmt=fmt)
In [ ]: