In [29]:
import nltk
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import Pipeline
In [8]:
# Read the tab-separated training and test files into DataFrames.
train, test = (
    pd.read_csv(path, sep='\t') for path in ('train.tsv', 'test.tsv')
)
In [10]:
# (rows, columns) of the training frame followed by the test frame.
tuple(frame.shape for frame in (train, test))
Out[10]:
In [20]:
# Preview the first five rows of the training frame.
train.head(5)
Out[20]:
In [13]:
# Preview the first five rows of the test frame.
test.head(5)
Out[13]:
In [15]:
# Distinct values that occur in the Sentiment column.
train['Sentiment'].unique()
Out[15]:
In [21]:
# Print a summary of the training frame (columns, non-null counts, dtypes).
train.info()
In [22]:
# Number of training rows for each Sentiment value.
train['Sentiment'].value_counts()
Out[22]:
In [23]:
# Share of training rows per Sentiment value. normalize=True makes
# value_counts return relative frequencies directly, replacing the manual
# division of the counts by the number of non-null entries.
train['Sentiment'].value_counts(normalize=True)
Out[23]:
In [24]:
# Model input is the Phrase column; the target is the Sentiment column.
X_train, y_train = train['Phrase'], train['Sentiment']
In [30]:
# Text classifier: token counts -> TF-IDF weighting -> logistic regression.
# Pipeline.fit returns the pipeline itself, so construction and fitting on the
# training phrases/labels are chained into a single assignment.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
]).fit(X_train, y_train)
In [31]:
# Produce out-of-fold predictions for the training phrases rather than
# predicting on the very rows the model was fitted on: scoring a model on its
# own training data gives an optimistic accuracy. Each training phrase is
# predicted by a model that did not see it during fitting.
# cross_val_predict fits clones of the pipeline, so the already-fitted
# `text_clf` is left untouched for the later test-set predictions.
# NOTE(review): if several phrases originate from the same source sentence,
# a grouped splitter (e.g. GroupKFold) would avoid leakage between folds — confirm
# against the columns available in the training file.
predicted = cross_val_predict(text_clf, X_train, y_train, cv=5)
In [32]:
# Fraction of phrases whose predicted label equals the true training label.
print((predicted == y_train).mean())
In [34]:
# Print a summary of the test frame (columns, non-null counts, dtypes).
test.info()
In [35]:
# Predict a Sentiment value for every phrase in the test frame and pair each
# prediction with the PhraseId of the row it came from.
phraseIds = test['PhraseId']
X_test = test['Phrase']
predicted = text_clf.predict(X_test)
output = pd.DataFrame({"PhraseId": phraseIds, "Sentiment": predicted})
# The CSV export below is disabled; uncomment it to write the submission file
# (quoting=3 is the value of csv.QUOTE_NONE).
#output.to_csv( "submission_logistic_regression.csv", index=False, quoting=3 )