Some notes:
Notebook author: Zack Nagler
In [2]:
from __future__ import division

import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import grid_search
from sklearn.grid_search import GridSearchCV
from sklearn.dummy import DummyClassifier
import statsmodels.formula.api as smf
from textblob import TextBlob
import pymysql
#viz
import matplotlib
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
sns.set(color_codes=True)
#text classification (also used in the 20 newsgroups example below)
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
In [3]:
rows = []
with open('full_conversations.txt') as fp:
    for line in fp:
        row = line.split(" +++$+++ ")
        rows.append(row)
print len(rows)
cols = ["docket",
        "id",
        "after_prev",
        "speaker",
        "is_justice",
        "justice_vote", #want this (5)
        "presentation_side", #want this (6)
        "utterance", #want this (7)
        ]
pd.set_option('display.max_colwidth', 40)
df = pd.DataFrame(rows)
df.columns = cols
df.head()
Out[3]:
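As a quick sanity check (a minimal sketch, not part of the original analysis), every parsed row should contain exactly the eight fields named in cols:
In [ ]:
# Each " +++$+++ "-delimited line should split into the 8 fields listed in cols.
print pd.Series([len(r) for r in rows]).value_counts()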
In [4]:
counts = df[df.is_justice=="JUSTICE"].speaker.value_counts()
counts
Out[4]:
In [5]:
speakers = counts[counts>100].index.values
print(speakers)
df = df[df.speaker.isin(speakers)]
len(df)
Out[5]:
In [6]:
pd.set_option('display.max_colwidth', -1)
df[df.speaker=="JUSTICE ROBERTS"].utterance
Out[6]:
In [7]:
pd.set_option('display.max_colwidth', 40)
df.shape
Out[7]:
In [8]:
df.presentation_side.value_counts()
Out[8]:
In [9]:
sides = ["PETITIONER","RESPONDENT"]
df = df[(df.presentation_side.isin(sides)) & (df.justice_vote.isin(sides)) & (df.is_justice=="JUSTICE") ]
In [10]:
df.shape
Out[10]:
In [11]:
df.justice_vote.value_counts()
Out[11]:
In [12]:
df.head()
Out[12]:
In [13]:
df.presentation_side = df.presentation_side.map({"PETITIONER": 1, "RESPONDENT": 0})
df.justice_vote = df.justice_vote.map({"PETITIONER": 1, "RESPONDENT": 0})
In [14]:
df.describe()
Out[14]:
In [15]:
df.head()
# def polarize(data):
#     return TextBlob(data).polarity
# df["polarity"] = df.utterance.apply(polarize)
Out[15]:
In [16]:
rows = []
for docket in df.docket.unique():
    cond_a = (df.docket == docket)
    for speaker in speakers:
        cond_b = (df.speaker == speaker)
        # keep only (docket, speaker) pairs where the justice spoke during both sides' presentations
        if len(df[(cond_a)&(cond_b)].presentation_side.unique())!=2: continue
        justice_vote = df[(cond_a)&(cond_b)].justice_vote.head(1).values[0]
        row = [docket,speaker,justice_vote]
        for presentation_side in [0,1]:
            cond_c = (df.presentation_side == presentation_side)
            temp_df = df[(cond_a) & (cond_b) & (cond_c)]
            utterances = temp_df.utterance
            # print(utterances.head(1).values)
            text = " ".join(utterances.tolist()).replace('\n', ' ').replace('--', '')
            row.append(text)
        rows.append(row)
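The same aggregation can also be expressed with a pandas groupby. This is just an equivalent sketch of the loop above (the names join_utterances and alt_df2 are illustrative; the results below are still built from rows):
In [ ]:
# Equivalent sketch: concatenate each justice's utterances per docket and presentation side.
def join_utterances(series):
    return " ".join(series.tolist()).replace('\n', ' ').replace('--', '')

texts = (df.groupby(["docket", "speaker", "presentation_side"])
           .utterance.apply(join_utterances)
           .unstack("presentation_side")
           .dropna())  # keep only docket/speaker pairs with utterances toward both sides
texts.columns = ["pres0_text", "pres1_text"]
votes = df.groupby(["docket", "speaker"]).justice_vote.first()
alt_df2 = texts.join(votes).reset_index()
alt_df2.head()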
In [17]:
cols = ["docket",
"speaker",
"justice_vote",
"pres0_text",
"pres1_text",
]
print len(rows)
df2 = pd.DataFrame(rows)
df2.columns = cols
df2.head()
Out[17]:
In [18]:
###### Naive Bayes ########
X0 = df2.pres0_text
y0 = df2.justice_vote
X1 = df2.pres1_text
y1 = df2.justice_vote
nb_pipeline = Pipeline([('vect', TfidfVectorizer()),
                        ('clf', MultinomialNB()),
                        ])
nb_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                 'vect__stop_words': ["english",None],
                 'clf__alpha': (1e-2, 1e-3),
                 }
nb_gs = GridSearchCV(nb_pipeline, nb_parameters, n_jobs=-1)
nb0_gs = nb_gs.fit(X0,y0)
nb0_best_parameters, nb0_score, _ = max(nb0_gs.grid_scores_, key=lambda x: x[1])
for param_name in sorted(nb0_best_parameters.keys()):
    print("%s: %r" % (param_name, nb0_best_parameters[param_name]))
print("nb0 score: " + str(nb0_score))
nb1_gs = nb_gs.fit(X1,y1)
nb1_best_parameters, nb1_score, _ = max(nb1_gs.grid_scores_, key=lambda x: x[1])
for param_name in sorted(nb1_best_parameters.keys()):
    print("%s: %r" % (param_name, nb1_best_parameters[param_name]))
print("nb1 score: " + str(nb1_score))
print "Dummy score: " + str(y0[y0==y0.mode().values[0]].size/y0.size)
In [19]:
###### Support Vector Machine ########
X0 = df2.pres0_text
y0 = df2.justice_vote
X1 = df2.pres1_text
y1 = df2.justice_vote
sv_pipeline = Pipeline([('vect', TfidfVectorizer()),
                        ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                              alpha=1e-3, n_iter=5, random_state=42)),
                        ])
sv_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                 'vect__stop_words': ["english",None],
                 }
sv_gs = GridSearchCV(sv_pipeline, sv_parameters, n_jobs=-1)
sv0_gs = sv_gs.fit(X0,y0)
sv0_best_parameters, sv0_score, _ = max(sv0_gs.grid_scores_, key=lambda x: x[1])
for param_name in sorted(sv0_best_parameters.keys()):
    print("%s: %r" % (param_name, sv0_best_parameters[param_name]))
print("sv0 score: " + str(sv0_score))
sv1_gs = sv_gs.fit(X1,y1)
sv1_best_parameters, sv1_score, _ = max(sv1_gs.grid_scores_, key=lambda x: x[1])
for param_name in sorted(sv1_best_parameters.keys()):
    print("%s: %r" % (param_name, sv1_best_parameters[param_name]))
print("sv1 score: " + str(sv1_score))
print "Dummy score: " + str(y0[y0==y0.mode().values[0]].size/y0.size)
In [20]:
###### LOGISTIC REGRESSION ########
X0 = df2.pres0_text
y0 = df2.justice_vote
X1 = df2.pres1_text
y1 = df2.justice_vote
lr_pipeline = Pipeline([('vect', TfidfVectorizer()),
                        ('clf', LogisticRegression()),
                        ])
lr_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                 'vect__stop_words': ["english",None],
                 }
lr_gs = GridSearchCV(lr_pipeline, lr_parameters, n_jobs=-1)
lr0_gs = lr_gs.fit(X0,y0)
lr0_best_parameters, lr0_score, _ = max(lr0_gs.grid_scores_, key=lambda x: x[1])
for param_name in sorted(lr0_best_parameters.keys()):
    print("%s: %r" % (param_name, lr0_best_parameters[param_name]))
print("lr0 score: " + str(lr0_score))
lr1_gs = lr_gs.fit(X1,y1)
lr1_best_parameters, lr1_score, _ = max(lr1_gs.grid_scores_, key=lambda x: x[1])
for param_name in sorted(lr1_best_parameters.keys()):
    print("%s: %r" % (param_name, lr1_best_parameters[param_name]))
print("lr1 score: " + str(lr1_score))
print "Dummy score: " + str(y0[y0==y0.mode().values[0]].size/y0.size)
In [21]:
for speaker in speakers:
    subframe = df2[df2.speaker==speaker]
    if len(subframe) < 10: continue
    print speaker + ": " + str(len(subframe))
    X = subframe.pres0_text
    y = subframe.justice_vote
    ## Naive Bayes
    nb_pipeline = Pipeline([('vect', TfidfVectorizer()),
                            ('clf', MultinomialNB()),
                            ])
    nb_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                     'vect__stop_words': ["english",None],
                     'clf__alpha': (1e-2, 1e-3,1e-4),
                     }
    # Tried ngrams up to (1,7) and they didn't beat (1,4)
    nb_gs = GridSearchCV(nb_pipeline, nb_parameters, n_jobs=-1)
    nb_gs = nb_gs.fit(X,y)
    nb_best_parameters, nb_score, _ = max(nb_gs.grid_scores_, key=lambda x: x[1])
    #### Support Vector
    sv_pipeline = Pipeline([('vect', TfidfVectorizer()),
                            ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                                  alpha=1e-3, n_iter=5, random_state=42)),
                            ])
    sv_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                     'vect__stop_words': ["english",None],
                     }
    sv_gs = GridSearchCV(sv_pipeline, sv_parameters, n_jobs=-1)
    sv_gs = sv_gs.fit(X,y)
    sv_best_parameters, sv_score, _ = max(sv_gs.grid_scores_, key=lambda x: x[1])
    #### Logistic Regression
    lr_pipeline = Pipeline([('vect', TfidfVectorizer()),
                            ('clf', LogisticRegression()),
                            ])
    lr_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                     'vect__stop_words': ["english",None],
                     }
    lr_gs = GridSearchCV(lr_pipeline, lr_parameters, n_jobs=-1)
    lr_gs = lr_gs.fit(X,y)
    lr_best_parameters, lr_score, _ = max(lr_gs.grid_scores_, key=lambda x: x[1])
    # for param_name in sorted(parameters.keys()):
    #     print("%s: %r" % (param_name, nb_best_parameters[param_name]))
    print "Naive Bayes score :" + str(nb_score)
    print "Support Vector score :" + str(sv_score)
    print "Logistic Regression score :" + str(lr_score)
    print "Dummy score: " + str(y[y==y.mode().values[0]].size/y.size)
In [22]:
for speaker in speakers:
    subframe = df2[df2.speaker==speaker]
    if len(subframe) < 10: continue
    print speaker + ": " + str(len(subframe))
    X = subframe.pres1_text
    y = subframe.justice_vote
    ## Naive Bayes
    nb_pipeline = Pipeline([('vect', TfidfVectorizer()),
                            ('clf', MultinomialNB()),
                            ])
    nb_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                     'vect__stop_words': ["english",None],
                     'clf__alpha': (1e-2, 1e-3,1e-4),
                     }
    # Tried ngrams up to (1,7) and they didn't beat (1,4)
    nb_gs = GridSearchCV(nb_pipeline, nb_parameters, n_jobs=-1)
    nb_gs = nb_gs.fit(X,y)
    nb_best_parameters, nb_score, _ = max(nb_gs.grid_scores_, key=lambda x: x[1])
    #### Support Vector
    sv_pipeline = Pipeline([('vect', TfidfVectorizer()),
                            ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                                  alpha=1e-3, n_iter=5, random_state=42)),
                            ])
    sv_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                     'vect__stop_words': ["english",None],
                     }
    sv_gs = GridSearchCV(sv_pipeline, sv_parameters, n_jobs=-1)
    sv_gs = sv_gs.fit(X,y)
    sv_best_parameters, sv_score, _ = max(sv_gs.grid_scores_, key=lambda x: x[1])
    #### Logistic Regression
    lr_pipeline = Pipeline([('vect', TfidfVectorizer()),
                            ('clf', LogisticRegression()),
                            ])
    lr_parameters = {'vect__ngram_range': [(1, 1),(1,2),(1,3),(1,4)],
                     'vect__stop_words': ["english",None],
                     }
    lr_gs = GridSearchCV(lr_pipeline, lr_parameters, n_jobs=-1)
    lr_gs = lr_gs.fit(X,y)
    lr_best_parameters, lr_score, _ = max(lr_gs.grid_scores_, key=lambda x: x[1])
    # for param_name in sorted(parameters.keys()):
    #     print("%s: %r" % (param_name, nb_best_parameters[param_name]))
    print "Naive Bayes score :" + str(nb_score)
    print "Support Vector score :" + str(sv_score)
    print "Logistic Regression score :" + str(lr_score)
    print "Dummy score: " + str(y[y==y.mode().values[0]].size/y.size)
In [23]:
categories = ['alt.atheism', 'soc.religion.christian',
              'comp.graphics', 'sci.med']
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories, shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories, shuffle=True, random_state=42)
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(twenty_train.data)
# X_train_counts.shape
# tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
# X_train_tf = tf_transformer.transform(X_train_counts)
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                     ])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
docs_test = twenty_test.data
predicted = text_clf.predict(docs_test)
np.mean(predicted == twenty_test.target)
Out[23]:
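Beyond overall accuracy, the metrics module imported above can summarize the 20 newsgroups example per class. A small follow-up sketch:
In [ ]:
# Per-class precision/recall/F1 and a confusion matrix for the 20 newsgroups example.
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))
print(metrics.confusion_matrix(twenty_test.target, predicted))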