notebook.community

Edit and run



In [107]:

    
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import xgboost as xgb

import csv
import glob
import json
import os
import sys
import pickle
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer



In [108]:

    
stops_w_pronouns = set(stopwords.words('english')) - set(['i', 'he', 'she', 'you', 
                                       'me', 'him', 'her', 'yours',
                                       'mine', 'his', 'hers', 'ours', 'our', 'your',
                                       'hiself', 'herself', 'myself', 'ourselves', 'yourselves',
                                       'yourself', 'it', 'itself', 'them', 'themselves', 'its',
                                       'themselves', 'their', 'theirselves', 'they'])









    



//anaconda/envs/ml/lib/python3.5/site-packages/nltk/corpus/reader/wordlist.py:25: ResourceWarning: unclosed file <_io.BufferedReader name='/Users/pvomelveny/nltk_data/corpora/stopwords/english'>
  return concat([self.open(f).read() for f in fileids])



In [123]:

    
stops_w_pronouns.add('ptsd')



In [127]:

    
stops = stops_w_pronouns
stops









    Out[127]:





{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'did',
 'do',
 'does',
 'doing',
 'don',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'has',
 'have',
 'having',
 'here',
 'himself',
 'how',
 'if',
 'in',
 'into',
 'is',
 'just',
 'more',
 'most',
 'my',
 'no',
 'nor',
 'not',
 'now',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'out',
 'over',
 'own',
 'ptsd',
 's',
 'same',
 'should',
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 'the',
 'theirs',
 'then',
 'there',
 'these',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 'very',
 'was',
 'we',
 'were',
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with'}

Use pickeled reddit data from Seb's reddit scraper script



In [128]:

    
with open('../Reddit/reddit_data.p', 'rb') as f:
    reddit_data = pickle.load(f)

Clean text



In [129]:

    
def cleaner(text):
    # Replace non-letter characters w/ space
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    # Lower
    low_string = letters_only.lower()
    # Split
    words = [thing for thing in low_string.split(' ') if thing]
    
    wnl = WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in words]
    
    meaningful_words = [word for word in lemmas if word not in stops_w_pronouns]
    
    return ' '.join(lemmas)



In [130]:

    
reddit_data['clean'] = reddit_data['text'].apply(cleaner)

Pull out variables



In [131]:

    
vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)

preds = pd.DataFrame(vectorizer.fit_transform(reddit_data['clean']).toarray(), index=reddit_data.index)

labels = LabelEncoder().fit(reddit_data['flag'])
target = pd.Series(labels.transform(reddit_data['flag']), index=reddit_data.index)

XGBoost on our cleaned data



In [132]:

    
xgtrain = xgb.DMatrix(preds.values, target.values)

xgboost_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'auc', 'eta': 0.01,
                  'subsample': 0.75, 'colsample_bytree': 0.68, 'max_depth': 7}

xgb.cv(xgboost_params, xgtrain, num_boost_round=5, nfold=5, metrics={'error'}, seed=0, show_stdv=False)









    



//anaconda/envs/ml/lib/python3.5/site-packages/xgboost/training.py:272: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]






    Out[132]:






  
    
      
      test-auc-mean
      test-auc-std
      test-error-mean
      test-error-std
      train-auc-mean
      train-auc-std
      train-error-mean
      train-error-std
    
  
  
    
      0
      0.943148
      0.013296
      0.1025
      0.011726
      0.961346
      0.006236
      0.071000
      0.006727
    
    
      1
      0.961435
      0.011535
      0.0960
      0.011467
      0.975456
      0.003421
      0.067375
      0.005265
    
    
      2
      0.972123
      0.009468
      0.0870
      0.012186
      0.985033
      0.004068
      0.057375
      0.005440
    
    
      3
      0.975958
      0.008453
      0.0780
      0.016688
      0.987691
      0.003983
      0.049125
      0.008068
    
    
      4
      0.978725
      0.006595
      0.0735
      0.016926
      0.989843
      0.002518
      0.047625
      0.006828



In [ ]:

	test-auc-mean	test-auc-std	test-error-mean	test-error-std	train-auc-mean	train-auc-std	train-error-mean	train-error-std
0	0.943148	0.013296	0.1025	0.011726	0.961346	0.006236	0.071000	0.006727
1	0.961435	0.011535	0.0960	0.011467	0.975456	0.003421	0.067375	0.005265
2	0.972123	0.009468	0.0870	0.012186	0.985033	0.004068	0.057375	0.005440
3	0.975958	0.008453	0.0780	0.016688	0.987691	0.003983	0.049125	0.008068
4	0.978725	0.006595	0.0735	0.016926	0.989843	0.002518	0.047625	0.006828