In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn import datasets
import cPickle as pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize
from nltk.stem import snowball
stemmer = snowball.SnowballStemmer("english")
from scipy import sparse
import matplotlib.pyplot as plt
%matplotlib inline
import sys


/home/mgupta/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)

# map labels from {-1, 1} to {0, 1}
labels, y = np.unique(y, return_inverse=True)

X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

In [3]:
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)

In [4]:
param = {'max_depth':2,
             'eta':0.5,
             'silent':1,
             'objective':'binary:logistic',
             'eval_metric':'auc'
             }
num_round = 5
watchlist = [(xg_train, 'train'), (xg_test, 'eval')]
results = dict()

In [5]:
model = xgb.train(param,
                  xg_train,
                  num_round,
                  watchlist,
                  evals_result=results,
                  maximize=False,
                  verbose_eval=True)


[0]	train-auc:0.603368	eval-auc:0.582255
[1]	train-auc:0.673926	eval-auc:0.644343
[2]	train-auc:0.725316	eval-auc:0.682996
[3]	train-auc:0.775957	eval-auc:0.723403
[4]	train-auc:0.81698	eval-auc:0.756981

In [6]:
model.predict(xg_test)


Out[6]:
array([ 0.37586698,  0.37586698,  0.37586698, ...,  0.47907889,
        0.37586698,  0.74961442], dtype=float32)
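
With objective='binary:logistic', Booster.predict returns the probability of the positive class for each row rather than hard labels. A minimal sketch of thresholding at 0.5 (the threshold itself is an assumption, not something tuned here):

In [ ]:
preds = model.predict(xg_test)
pred_labels = (preds > 0.5).astype(int)  # 0/1 class labels from probabilities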

In [39]:
model = xgb.Booster(model_file='../tfidf/xgbfinal2.model')

In [40]:
df = pd.DataFrame({'f_score': model.get_fscore().values()},
                  index=model.get_fscore().keys())
df = df.sort_values(by='f_score', ascending=True)
df[:10].plot(kind='barh')
plt.xlabel("F score (number of splits)")
plt.ylabel("Features")
plt.title("Feature Importances")


Out[40]:
<matplotlib.text.Text at 0x7f368c098d90>

In [32]:
model.get_fscore().values()


Out[32]:
[3, 2, 1, 1, 2, 2, 2, 1]

In [16]:
xgb.plot_importance(model)


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f368d22ded0>

In [65]:
model = xgb.XGBClassifier()
print model


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [66]:
model.fit(X_train,y_train,eval_metric='auc')


Out[66]:
XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [20]:
model.predict_proba(X_test)


Out[20]:
array([[ 0.65184641,  0.34815356],
       [ 0.74603701,  0.25396299],
       [ 0.32435191,  0.67564809],
       ..., 
       [ 0.71962714,  0.28037286],
       [ 0.08567679,  0.91432321],
       [ 0.01836032,  0.98163968]], dtype=float32)

In [24]:
150/60.0


Out[24]:
2.5

In [26]:
X_train.shape


Out[26]:
(2000, 10)

In [73]:
def load_data(filename):
    '''
    Load pickled data into a DataFrame for use in running the model
    '''
    return pickle.load(open(filename, 'rb'))


def stem_tokens(tokens, stemmer):
    '''Stem the tokens.'''
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed


def OHStokenize(text):
    '''Tokenize & stem. Stemming is applied automatically for now;
    "stemmer" is left out of the signature so this can be passed directly to TfidfVectorizer.'''
    tokens = word_tokenize(text)
    stems = stem_tokens(tokens, stemmer)
    return stems

###########################################################################
# tokenization code

def separatePunct(incomingString):
    '''Pad punctuation characters with spaces so they split into their own tokens.'''
    outstr = ''
    characters = set(['!','@','#','$',"%","^","&","*",":","\\",
                  "(",")","+","=","?","\'","\"",";","/",
                  "{","}","[","]","<",">","~","`","|"])

    for char in incomingString:
        if char in characters:
            outstr = outstr + ' ' + char + ' '
        else:
            outstr = outstr + char

    return outstr

def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)

def text_cleaner(wordList):
    '''
    INPUT: List of words to be tokenized
    OUTPUT: List of tokenized words
    '''

    tokenizedList = []

    for word in wordList:

        #remove these substrings from the word
        word = word.replace('[deleted]','')
        word = word.replace('&gt','')

        #if link, replace with linktag
        if 'http' in word:
            tokenizedList.append('LINK_TAG')
            continue

        #if reference to subreddit, replace with reddittag
        if '/r/' in word:
            tokenizedList.append('SUBREDDIT_TAG')
            continue

        #if reference to reddit user, replace with usertag
        if '/u/' in word:
            tokenizedList.append('USER_TAG')
            continue

        #if reference to twitter user, replace with usertag
        if '@' in word:
            tokenizedList.append('USER_TAG')
            continue

        #if number, replace with numtag
        #m8 is a word; 5'10", 54-59, and 56:48 are numbers
        if hasNumbers(word) and not any(char.isalpha() for char in word):
            tokenizedList.append('NUM_TAG')
            continue

        #separate punctuation and add to tokenizedList
        newwords = separatePunct(word).split(" ")
        tokenizedList.extend(newwords)

    return tokenizedList

def mytokenize(comment):
    '''
    Input: takes in a reddit comment as a str or unicode and tokenizes it
    Output: a tokenized list
    '''
    tokenizer = PunktSentenceTokenizer()
    sentenceList = tokenizer.tokenize(comment)
    wordList = []
    for sentence in sentenceList:
        wordList.extend(sentence.split(" "))

    return text_cleaner(wordList)
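
A quick sanity check of the tokenizer on a made-up comment (the string below is illustrative only):

In [ ]:
mytokenize("Check /r/python and http://example.com , that took 45 minutes!")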

In [74]:
path = '../../data/labeledRedditComments2.p'
cvpath = '../../data/twitter_cross_val.csv'

df = pickle.load(open(path, 'rb'))
dfcv = pd.read_csv(cvpath)

In [75]:
#take a subset of the data for testing this code
randNums = np.random.randint(low=0,high=len(df.index),size=(200,1))
rowList = [int(row) for row in randNums]
dfsmall = df.ix[rowList,:]

In [76]:
nf = dfsmall
X = nf.body
y = nf.label

Xcv = dfcv['tweet_text'].values
ycv = dfcv['label'].values

In [83]:
vect = TfidfVectorizer(stop_words='english', decode_error='ignore',
                           tokenizer=OHStokenize)


# fit & transform comments matrix
tfidf_X = vect.fit_transform(X)
tfidf_Xcv = vect.transform(Xcv)

In [84]:
print tfidf_X.shape
print tfidf_Xcv.shape


(200, 1700)
(10000, 1700)

In [124]:
type(tfidf_X)


Out[124]:
scipy.sparse.csr.csr_matrix

In [129]:
nr, nc = tfidf_Xcvd.shape
newarray = np.ones(shape=(1,nc))
newtfidf_Xcvd = np.vstack((tfidf_Xcvd,newarray))
newsparse = sparse.csr_matrix(newtfidf_Xcvd)

In [135]:
newsparse.shape


Out[135]:
(10001, 1700)

In [136]:
ncv = np.ones(shape=(1, ))
ycvnew = np.concatenate((ycv,ncv))

In [137]:
ycvnew.shape


Out[137]:
(10001,)

In [85]:
tfidf_Xcvd = tfidf_Xcv.todense()

In [108]:
sys.getsizeof(tfidf_Xcv.todense())


Out[108]:
144

In [109]:
sys.getsizeof(tfidf_X)


Out[109]:
64
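
sys.getsizeof only measures the Python wrapper object (the matrix header), not the underlying buffers, which is why both numbers above are tiny. A hedged sketch of measuring the actual memory footprint instead:

In [ ]:
print tfidf_Xcvd.nbytes                                                      # dense buffer, in bytes
print tfidf_X.data.nbytes + tfidf_X.indices.nbytes + tfidf_X.indptr.nbytes   # CSR buffers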

In [130]:
xg_cv = xgb.DMatrix(tfidf_Xcv, label=ycv)
xg_train = xgb.DMatrix(tfidf_X, label=y)
xg_newsparse = xgb.DMatrix(newsparse , label=y)
xg_cvd = xgb.DMatrix(tfidf_Xcvd, label=ycv)
print xg_train.feature_types


None

In [131]:
print xg_train.num_col()
print xg_cv.num_col()
print xg_cvd.num_col()
print xg_newsparse.num_col()


1700
1696
1700
1700
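
The 1696 for xg_cv is worth a note: in this xgboost version, a DMatrix built from a sparse CSR matrix appears to infer its width from the highest column index that holds a nonzero value, so trailing all-zero TF-IDF columns get dropped. Converting to dense (xg_cvd) or stacking on the dummy row of ones (xg_newsparse) forces all 1700 columns to exist. A small check, assuming the matrices defined above:

In [ ]:
print tfidf_Xcv.indices.max() + 1   # inferred width from nonzero columns (likely 1696)
print tfidf_Xcv.shape[1]            # width reported by scipy (1700)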

In [88]:
param = {'max_depth':2,
             'eta':0.5,
             'silent':1,
             'objective':'binary:logistic',
             'eval_metric':'auc'
             }
num_round = 5
watchlist = [(xg_train, 'train'), (xg_cvd, 'eval')]
results = dict()

In [63]:
model = xgb.train(param,
                  xg_train,
                  num_round,
                  watchlist,
                  evals_result=results,
                  maximize=False,
                  verbose_eval=True)


[0]	train-auc:0.557143	eval-auc:0.499306
[1]	train-auc:0.632473	eval-auc:0.494721
[2]	train-auc:0.715275	eval-auc:0.488865
[3]	train-auc:0.74478	eval-auc:0.478342
[4]	train-auc:0.780549	eval-auc:0.5674

In [64]:
results


Out[64]:
{'eval': {'auc': [0.499306, 0.494721, 0.488865, 0.478342, 0.5674]},
 'train': {'auc': [0.557143, 0.632473, 0.715275, 0.74478, 0.780549]}}
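
The evals_result dict is handy for plotting the learning curve. A minimal sketch, assuming the results dict filled in by the xgb.train call above:

In [ ]:
plt.plot(results['train']['auc'], label='train')
plt.plot(results['eval']['auc'], label='eval')
plt.xlabel('Boosting round')
plt.ylabel('AUC')
plt.legend(loc='best')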

In [43]:
paramdict = {'max_depth':[3,6],'learning_rate':[0.1,0.2]}

gs = GridSearchCV(xgb.XGBClassifier(), param_grid=paramdict, scoring='roc_auc', n_jobs=-1)
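
A hedged sketch of actually running the grid search on the TF-IDF features (this was not executed in the notebook); 'roc_auc' is scikit-learn's name for the AUC scorer:

In [ ]:
gs.fit(tfidf_X, y)
print gs.best_params_
print gs.best_score_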

In [46]:
df = pd.DataFrame?

In [49]:
labels = ['max_depth','eta','num_rounds','eval_results']
df = pd.DataFrame(columns=labels)

In [56]:
data = [[3,0.1,100,results],[3,0.1,100,results],[3,0.1,100,results]]

In [57]:
d = pd.DataFrame(data=data,columns=labels)

In [58]:
d


Out[58]:
   max_depth  eta  num_rounds                                       eval_results
0          3  0.1         100  {u'train': {u'auc': [0.557143, 0.632473, 0.715...
1          3  0.1         100  {u'train': {u'auc': [0.557143, 0.632473, 0.715...
2          3  0.1         100  {u'train': {u'auc': [0.557143, 0.632473, 0.715...

In [89]:
# NOTE: zip pairs each row of tfidf_Xcv with a single label; eval_set expects
# a list of (X, y) tuples, which is what triggers the ValueError below
evalSet = zip(tfidf_Xcv,ycv)

In [90]:
model = xgb.XGBClassifier(max_depth = 3,
                          n_estimators = 20,
                          learning_rate = 0.5)

In [92]:
model.fit(tfidf_X,y,eval_set = evalSet, eval_metric = 'auc')
# model.fit(tfidf_X,y)

# results = model.evals_result()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-92-dc5e7c07a2e1> in <module>()
----> 1 model.fit(tfidf_X,y,eval_set = evalSet, eval_metric = 'auc')
      2 # model.fit(tfidf_X,y)
      3 
      4 # results = model.evals_result()

/home/mgupta/anaconda2/lib/python2.7/site-packages/xgboost/sklearn.pyc in fit(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose)
    422             evals = list(
    423                 DMatrix(x[0], label=self._le.transform(x[1]), missing=self.missing)
--> 424                 for x in eval_set
    425             )
    426             nevals = len(evals)

/home/mgupta/anaconda2/lib/python2.7/site-packages/xgboost/sklearn.pyc in <genexpr>((x,))
    422             evals = list(
    423                 DMatrix(x[0], label=self._le.transform(x[1]), missing=self.missing)
--> 424                 for x in eval_set
    425             )
    426             nevals = len(evals)

/home/mgupta/anaconda2/lib/python2.7/site-packages/sklearn/preprocessing/label.pyc in transform(self, y)
    141         """
    142         check_is_fitted(self, 'classes_')
--> 143         y = column_or_1d(y, warn=True)
    144 
    145         classes = np.unique(y)

/home/mgupta/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.pyc in column_or_1d(y, warn)
    560         return np.ravel(y)
    561 
--> 562     raise ValueError("bad input shape {0}".format(shape))
    563 
    564 

ValueError: bad input shape ()
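
The ValueError comes from eval_set: XGBClassifier.fit expects a list of (X, y) pairs, while zip(tfidf_Xcv, ycv) yields one tuple per row. A hedged sketch of the corrected call (not executed here); given the column-count caveat noted earlier, the dense tfidf_Xcvd or the padded newsparse may be the safer evaluation matrix:

In [ ]:
model.fit(tfidf_X, y, eval_set=[(tfidf_Xcv, ycv)], eval_metric='auc')
results = model.evals_result()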

In [ ]: