In [107]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import xgboost as xgb

import csv
import glob
import json
import os
import sys
import pickle
import re

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer

In [108]:
stops_w_pronouns = set(stopwords.words('english')) - set(['i', 'he', 'she', 'you', 
                                       'me', 'him', 'her', 'yours',
                                       'mine', 'his', 'hers', 'ours', 'our', 'your',
                                       'hiself', 'herself', 'myself', 'ourselves', 'yourselves',
                                       'yourself', 'it', 'itself', 'them', 'themselves', 'its',
                                       'themselves', 'their', 'theirselves', 'they'])


//anaconda/envs/ml/lib/python3.5/site-packages/nltk/corpus/reader/wordlist.py:25: ResourceWarning: unclosed file <_io.BufferedReader name='/Users/pvomelveny/nltk_data/corpora/stopwords/english'>
  return concat([self.open(f).read() for f in fileids])

In [123]:
stops_w_pronouns.add('ptsd')

In [127]:
stops = stops_w_pronouns
stops


Out[127]:
{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'did',
 'do',
 'does',
 'doing',
 'don',
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'has',
 'have',
 'having',
 'here',
 'himself',
 'how',
 'if',
 'in',
 'into',
 'is',
 'just',
 'more',
 'most',
 'my',
 'no',
 'nor',
 'not',
 'now',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'out',
 'over',
 'own',
 'ptsd',
 's',
 'same',
 'should',
 'so',
 'some',
 'such',
 't',
 'than',
 'that',
 'the',
 'theirs',
 'then',
 'there',
 'these',
 'this',
 'those',
 'through',
 'to',
 'too',
 'under',
 'until',
 'up',
 'very',
 'was',
 'we',
 'were',
 'what',
 'when',
 'where',
 'which',
 'while',
 'who',
 'whom',
 'why',
 'will',
 'with'}

Use pickeled reddit data from Seb's reddit scraper script


In [128]:
with open('../Reddit/reddit_data.p', 'rb') as f:
    reddit_data = pickle.load(f)

Clean text


In [129]:
def cleaner(text):
    # Replace non-letter characters w/ space
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    # Lower
    low_string = letters_only.lower()
    # Split
    words = [thing for thing in low_string.split(' ') if thing]
    
    wnl = WordNetLemmatizer()
    lemmas = [wnl.lemmatize(word) for word in words]
    
    meaningful_words = [word for word in lemmas if word not in stops_w_pronouns]
    
    return ' '.join(lemmas)

In [130]:
reddit_data['clean'] = reddit_data['text'].apply(cleaner)

Pull out variables


In [131]:
vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)

preds = pd.DataFrame(vectorizer.fit_transform(reddit_data['clean']).toarray(), index=reddit_data.index)

labels = LabelEncoder().fit(reddit_data['flag'])
target = pd.Series(labels.transform(reddit_data['flag']), index=reddit_data.index)

XGBoost on our cleaned data


In [132]:
xgtrain = xgb.DMatrix(preds.values, target.values)

xgboost_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'auc', 'eta': 0.01,
                  'subsample': 0.75, 'colsample_bytree': 0.68, 'max_depth': 7}

xgb.cv(xgboost_params, xgtrain, num_boost_round=5, nfold=5, metrics={'error'}, seed=0, show_stdv=False)


//anaconda/envs/ml/lib/python3.5/site-packages/xgboost/training.py:272: DeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
Out[132]:
test-auc-mean test-auc-std test-error-mean test-error-std train-auc-mean train-auc-std train-error-mean train-error-std
0 0.943148 0.013296 0.1025 0.011726 0.961346 0.006236 0.071000 0.006727
1 0.961435 0.011535 0.0960 0.011467 0.975456 0.003421 0.067375 0.005265
2 0.972123 0.009468 0.0870 0.012186 0.985033 0.004068 0.057375 0.005440
3 0.975958 0.008453 0.0780 0.016688 0.987691 0.003983 0.049125 0.008068
4 0.978725 0.006595 0.0735 0.016926 0.989843 0.002518 0.047625 0.006828

In [ ]: