In [1]:
import pickle
import numpy
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
# Seed NumPy's global RNG before anything else runs so a fresh
# "Restart & Run All" draws the same random numbers every time.
numpy.random.seed(42)

words_file = "../text_learning/your_word_data.pkl"
authors_file = "../text_learning/your_email_authors.pkl"

# Load both pickles through context managers so the file handles are closed
# deterministically instead of being left to the garbage collector.
# NOTE(review): mode "r" is kept to stay consistent with however these files
# were written (not visible from here); Python 3 requires "rb" -- confirm
# against the script that produces them before porting.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(words_file, "r") as words_in:
    word_data = pickle.load(words_in)
with open(authors_file, "r") as authors_in:
    authors = pickle.load(authors_in)
In [2]:
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()
print 'training data:',features_train.shape
print 'testing data:',features_test.shape
In [3]:
# Keep only the first 150 training rows (and their labels).
features_train = features_train[:150]
# Densify only if the matrix still exposes .toarray(); after the first run it
# is already a dense array, and this guard lets the cell be re-executed
# without raising AttributeError.
if hasattr(features_train, "toarray"):
    features_train = features_train.toarray()
labels_train = labels_train[:150]
In [4]:
# Decision tree with default hyperparameters (no depth or leaf-size limits
# are passed), fitted on the truncated training set.
clfOverfit = DecisionTreeClassifier()
# Left as the trailing expression so the notebook displays the fitted estimator.
clfOverfit.fit(X=features_train, y=labels_train)
Out[4]:
In [5]:
# Score the fitted tree on the held-out split.
clfOverfit.score(X=features_test, y=labels_test)
Out[5]:
In [6]:
for i in range(clfOverfit.feature_importances_.shape[0]):
if clfOverfit.feature_importances_[i] > 0.1:
print 'feature',i,':',clfOverfit.feature_importances_[i]
In [7]:
# Look up the vocabulary term for column 33614 -- presumably the index
# flagged by the importance scan in the previous cell; re-check it if the
# data or the seed changes.
feature_names = vectorizer.get_feature_names()
feature_names[33614]
Out[7]:
This looks rather like part of someone's email signature.
In [8]:
# Second pass: load the "...2.pkl" dataset and rebuild the whole pipeline.
# NOTE(review): presumably this is the data regenerated with the suspicious
# word from the previous pass removed -- confirm against the preprocessing script.
words_file = "../text_learning/your_word_data2.pkl"
authors_file = "../text_learning/your_email_authors2.pkl"

# Context managers close the file handles deterministically.
# NOTE(review): mode "r" is kept to match however the files were written;
# Python 3 requires "rb".
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(words_file, "r") as words_in:
    word_data = pickle.load(words_in)
with open(authors_file, "r") as authors_in:
    authors = pickle.load(authors_in)

# Same 90/10 split and the same seed as in the first pass.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

# Fit a fresh TF-IDF vectorizer on the new training split only.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

# Keep only the first 150 training rows (and their labels).
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]
In [9]:
# Refit a default-parameter decision tree on the reloaded training data.
clfOverfit = DecisionTreeClassifier()
# Left as the trailing expression so the notebook displays the fitted estimator.
clfOverfit.fit(X=features_train, y=labels_train)
Out[9]:
The new accuracy is still very high:
In [10]:
# Score the refitted tree on the held-out split.
clfOverfit.score(X=features_test, y=labels_test)
Out[10]:
In [11]:
for i in range(clfOverfit.feature_importances_.shape[0]):
if clfOverfit.feature_importances_[i] > 0.1:
print 'feature',i,':',clfOverfit.feature_importances_[i]
In [12]:
print vectorizer.get_feature_names()[8674]
print vectorizer.get_feature_names()[14343]
The first one looks like a timestamp; the second seems to be a signature. Removing the signature.
In [14]:
# Third pass: load the "...3.pkl" dataset and rebuild the whole pipeline.
# NOTE(review): presumably this is the data regenerated with the signature
# word from the previous pass removed -- confirm against the preprocessing script.
words_file = "../text_learning/your_word_data3.pkl"
authors_file = "../text_learning/your_email_authors3.pkl"

# Context managers close the file handles deterministically.
# NOTE(review): mode "r" is kept to match however the files were written;
# Python 3 requires "rb".
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(words_file, "r") as words_in:
    word_data = pickle.load(words_in)
with open(authors_file, "r") as authors_in:
    authors = pickle.load(authors_in)

# Same 90/10 split and the same seed as in the earlier passes.
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

# Fit a fresh TF-IDF vectorizer on the new training split only.
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

# Keep only the first 150 training rows (and their labels).
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]
In [15]:
# Refit a default-parameter decision tree on the third dataset's training data.
clfOverfit = DecisionTreeClassifier()
# Left as the trailing expression so the notebook displays the fitted estimator.
clfOverfit.fit(X=features_train, y=labels_train)
Out[15]:
Notable drop in accuracy:
In [16]:
# Score the refitted tree on the held-out split.
clfOverfit.score(X=features_test, y=labels_test)
Out[16]:
In [17]:
for i in range(clfOverfit.feature_importances_.shape[0]):
if clfOverfit.feature_importances_[i] > 0.1:
print 'feature',i,':',clfOverfit.feature_importances_[i]
These look like they are (probably) genuine content words.
In [18]:
print vectorizer.get_feature_names()[11975]
print vectorizer.get_feature_names()[18849]
print vectorizer.get_feature_names()[21323]