In [107]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
import xgboost as xgb
import csv
import glob
import json
import os
import sys
import pickle
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
In [108]:
stops_w_pronouns = set(stopwords.words('english')) - set(['i', 'he', 'she', 'you',
'me', 'him', 'her', 'yours',
'mine', 'his', 'hers', 'ours', 'our', 'your',
'hiself', 'herself', 'myself', 'ourselves', 'yourselves',
'yourself', 'it', 'itself', 'them', 'themselves', 'its',
'themselves', 'their', 'theirselves', 'they'])
In [123]:
# Remove the clinical term itself so the target label cannot leak into the
# bag-of-words features.
stops_w_pronouns.add('ptsd')
In [127]:
# NOTE(review): `stops` is an alias of stops_w_pronouns (not a copy) — mutating
# either name affects both. Downstream code references stops_w_pronouns
# directly, so `stops` appears to exist only for interactive inspection here.
stops = stops_w_pronouns
stops
Out[127]:
In [128]:
# Load the scraped Reddit posts; columns 'text' and 'flag' are used downstream.
# NOTE(review): pickle.load executes arbitrary code — only load trusted files.
# NOTE(review): hard-coded relative path assumes the notebook runs from its
# own directory — confirm.
with open('../Reddit/reddit_data.p', 'rb') as f:
reddit_data = pickle.load(f)
In [129]:
def cleaner(text):
# Replace non-letter characters w/ space
letters_only = re.sub('[^a-zA-Z]', ' ', text)
# Lower
low_string = letters_only.lower()
# Split
words = [thing for thing in low_string.split(' ') if thing]
wnl = WordNetLemmatizer()
lemmas = [wnl.lemmatize(word) for word in words]
meaningful_words = [word for word in lemmas if word not in stops_w_pronouns]
return ' '.join(lemmas)
In [130]:
# Clean every post; 'clean' holds the lemmatized string produced by cleaner().
reddit_data['clean'] = reddit_data['text'].apply(cleaner)
In [131]:
# Bag-of-words features capped at the 5000 most frequent terms.
# stop_words=None because stop-word filtering is intended to happen in
# cleaner() — vectorizing here should not re-filter.
vectorizer = CountVectorizer(analyzer='word', tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)
# NOTE: 'preds' means predictors (the feature matrix), not predictions.
# .toarray() densifies the sparse matrix — memory-heavy for large corpora.
preds = pd.DataFrame(vectorizer.fit_transform(reddit_data['clean']).toarray(), index=reddit_data.index)
# Encode the string class labels in 'flag' as integers for xgboost.
labels = LabelEncoder().fit(reddit_data['flag'])
target = pd.Series(labels.transform(reddit_data['flag']), index=reddit_data.index)
In [132]:
# 5-fold cross-validation with gradient-boosted trees on the bag-of-words
# features.
xgtrain = xgb.DMatrix(preds.values, target.values)
xgboost_params = {'objective': 'binary:logistic', 'booster': 'gbtree', 'eval_metric': 'auc', 'eta': 0.01,
'subsample': 0.75, 'colsample_bytree': 0.68, 'max_depth': 7}
# NOTE(review): metrics={'error'} here overrides the 'auc' eval_metric set in
# the params dict for this CV run — confirm which metric is intended.
# NOTE(review): num_boost_round=5 with eta=0.01 is far too few rounds for a
# converged model — presumably a smoke test.
xgb.cv(xgboost_params, xgtrain, num_boost_round=5, nfold=5, metrics={'error'}, seed=0, show_stdv=False)
Out[132]:
In [ ]: