In [124]:
import os
import pickle
import re

import bs4
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import sklearn as sk
import xgboost as xgb
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
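Both NLTK resources used below (the stopword list and WordNet) need a one-time download if they aren't already present locally; a minimal setup cell:
In [ ]:
# One-time NLTK corpus downloads (no-ops if already installed)
nltk.download('stopwords')
nltk.download('wordnet')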
In [125]:
# English stopwords, minus personal pronouns: pronoun usage is kept
# because it may carry signal for this comparison
stops_w_pronouns = set(stopwords.words('english')) - {
    'i', 'me', 'mine', 'myself',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself',
    'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself',
    'our', 'ours', 'ourselves',
    'they', 'them', 'their', 'theirs', 'themselves',
}
In [127]:
# Load the scraped Reddit posts (pickled DataFrame)
with open('../Reddit/reddit_data.p', 'rb') as f:
    df = pickle.load(f)
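A quick sanity check on the loaded frame (assuming, as the cells below do, a 'text' column and a 'flag' label of 'PTSD' or 'non_PTSD' per post):
In [ ]:
# Expect columns 'text' and 'flag' with values 'PTSD' / 'non_PTSD'
print(df.shape)
print(df.flag.value_counts())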
In [145]:
# Module-level lemmatizer so it isn't re-created on every call
wnl = WordNetLemmatizer()

def cleaner(text):
    # Replace non-letter characters with spaces
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    # Lowercase and split, dropping empty tokens left by repeated spaces
    words = [w for w in letters_only.lower().split(' ') if w]
    # Mark the first-person pronoun with a distinctive token so it
    # shows up as its own feature downstream
    words = ['personalI' if word == 'i' else word for word in words]
    # Drop stopwords (pronouns were kept back in the set above)
    meaningful_words = [word for word in words if word not in stops_w_pronouns]
    # Lemmatize what remains and rejoin into a single string
    lemmas = [wnl.lemmatize(word) for word in meaningful_words]
    return ' '.join(lemmas)
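A quick illustration of cleaner() on a made-up sentence; the exact output depends on NLTK's stopword list and WordNet data:
In [ ]:
# 'I' survives as the sentinel token, while ordinary stopwords
# ('can', 't', 'at') are dropped before lemmatization
cleaner("I can't sleep at night")  # -> 'personalI sleep night'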
In [146]:
df['text'] = df['text'].apply(cleaner)
In [147]:
p_df = df[df.flag == 'PTSD']
n_df = df[df.flag == 'non_PTSD']
In [148]:
def vectorize(text):
    # Bag-of-words counts over the 5,000 most frequent terms;
    # tokenization and stopword removal already happened in cleaner()
    vectorizer = CountVectorizer(analyzer='word',
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)
    train_data_features = vectorizer.fit_transform(text)
    vocab = vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
    return vocab, train_data_features.toarray()
In [149]:
p_names, p_feat = vectorize(p_df.text)
n_names, n_feat = vectorize(n_df.text)
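Because each call fits its own vectorizer, the two 5,000-term vocabularies are not identical; a quick overlap check (a sketch, not part of the original analysis):
In [ ]:
# How many of the 5,000 top terms the two groups share
print(len(set(p_names) & set(n_names)))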
In [150]:
p_frame = pd.DataFrame(p_feat, columns=p_names)
n_frame = pd.DataFrame(n_feat, columns=n_names)
In [158]:
# Per-post rate of the 100 most frequent terms in the PTSD group
p_frame.sum().sort_values(ascending=False)[:100] / 869
Out[158]:
In [157]:
# Per-post rate of the 100 most frequent terms in the non-PTSD group
n_frame.sum().sort_values(ascending=False)[:100] / 1000
Out[157]:
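The two normalized lists can also be lined up directly; a rough sketch, taking 869 and 1000 as the PTSD and non-PTSD post counts:
In [ ]:
# Per-post term rates for both groups, joined on term; terms missing
# from one group's vocabulary are filled with zero
p_rates = p_frame.sum() / 869
n_rates = n_frame.sum() / 1000
comparison = pd.concat([p_rates, n_rates], axis=1,
                       keys=['PTSD', 'non_PTSD']).fillna(0)
comparison['diff'] = comparison['PTSD'] - comparison['non_PTSD']
comparison.sort_values('diff', ascending=False).head(20)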
In [153]:
p_df.describe()
Out[153]:
In [144]:
n_df.describe()
Out[144]: