In [124]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import nltk
import xgboost as xgb

import os
import pickle
import re
import bs4

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer

In [125]:
# Standard English stopwords, minus personal pronouns, which are kept as features
stops_w_pronouns = set(stopwords.words('english')) - set(['i', 'he', 'she', 'you',
                                       'me', 'him', 'her', 'yours',
                                       'mine', 'his', 'hers', 'ours', 'our', 'your',
                                       'himself', 'herself', 'myself', 'ourselves', 'yourselves',
                                       'yourself', 'it', 'itself', 'them', 'themselves', 'its',
                                       'their', 'theirs', 'they'])
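
A quick sanity check (illustrative, not part of the original run) confirms the pronouns survive the set subtraction while ordinary stopwords do not:

In [ ]:
# The retained pronouns should no longer be filtered out, but 'the' still should be
assert 'i' not in stops_w_pronouns
assert 'myself' not in stops_w_pronouns
assert 'the' in stops_w_pronouns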

In [127]:
with open('../Reddit/reddit_data.p', 'rb') as f:
    df = pickle.load(f)

In [145]:
wnl = WordNetLemmatizer()

def cleaner(text):
    # Replace non-letter characters with spaces
    letters_only = re.sub('[^a-zA-Z]', ' ', text)
    # Lowercase
    low_string = letters_only.lower()
    # Split on whitespace, dropping empty strings
    words = low_string.split()
    # Remap 'i' to a placeholder token: CountVectorizer's default token pattern drops
    # single-character tokens, so this keeps the first-person pronoun countable
    words = ["personalI" if word == 'i' else word for word in words]

    # Drop stopwords while keeping personal pronouns
    meaningful_words = [word for word in words if word not in stops_w_pronouns]

    # Lemmatize whatever remains
    lemmas = [wnl.lemmatize(word) for word in meaningful_words]

    return ' '.join(lemmas)
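
For illustration, on a made-up sentence (not from the Reddit data) the cleaner produces the lowercased, lemmatized token string the vectorizer will see:

In [ ]:
# Illustrative only; the exact result depends on the NLTK stopword list version
cleaner("I couldn't sleep last night; the nightmares came back again.")
# roughly: 'personalI sleep last night nightmare came back'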

In [146]:
df['text'] = df['text'].apply(cleaner)

In [147]:
p_df = df[df.flag == 'PTSD']
n_df = df[df.flag == 'non_PTSD']

In [148]:
def vectorize(text):
    # Bag-of-words counts over the 5,000 most frequent tokens
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    train_data_features = vectorizer.fit_transform(text)
    vocab = vectorizer.get_feature_names()
    return vocab, train_data_features.toarray()
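
If this is rerun on a newer scikit-learn, note that CountVectorizer.get_feature_names() was removed in 1.2 in favour of get_feature_names_out(); a version-agnostic helper (a sketch, the get_vocab name is mine) could look like this:

In [ ]:
# Sketch: works on both old and new scikit-learn versions
def get_vocab(vectorizer):
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return vectorizer.get_feature_names()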

In [149]:
p_names, p_feat = vectorize(p_df.text)
n_names, n_feat = vectorize(n_df.text)

In [150]:
p_frame = pd.DataFrame(p_feat, columns=p_names)
n_frame = pd.DataFrame(n_feat, columns=n_names)

In [158]:
# Top 100 token counts, normalized by the number of unique PTSD posts (869)
p_frame.sum().sort_values(ascending=False)[:100] / 869


Out[158]:
personali     17.446490
it             4.846951
me             3.658228
you            2.073648
like           1.481013
he             1.420023
feel           1.224396
they           1.133487
ptsd           1.077100
time           1.050633
know           1.025316
people         1.011507
get            0.994246
she            0.906789
year           0.866513
ve             0.859609
her            0.848101
thing          0.814730
would          0.773303
one            0.756041
want           0.701956
life           0.682394
really         0.665132
even           0.663982
myself         0.645570
them           0.639816
day            0.590334
him            0.579977
go             0.569620
think          0.542002
                ...    
month          0.298044
our            0.296893
said           0.294591
better         0.293441
started        0.288838
always         0.281933
around         0.279632
come           0.277330
tell           0.272727
person         0.271577
else           0.270426
away           0.269275
talk           0.268124
try            0.266974
re             0.266974
anxiety        0.263521
trying         0.262371
ever           0.258918
long           0.257768
everything     0.252014
last           0.250863
two            0.250863
felt           0.247411
getting        0.243959
since          0.240506
therapy        0.237054
made           0.237054
many           0.235903
little         0.231300
home           0.231300
dtype: float64

In [157]:
# Same normalization for the non-PTSD posts (all 1000 are unique)
n_frame.sum().sort_values(ascending=False)[:100] / 1000


Out[157]:
personali    21.164
it            6.569
me            4.054
she           3.330
her           2.984
you           2.580
he            2.125
get           1.545
like          1.490
they          1.273
one           1.179
time          1.151
back          1.091
go            1.070
his           1.041
him           0.954
edit          0.949
got           0.913
would         0.901
day           0.877
know          0.868
didn          0.860
going         0.795
thing         0.788
them          0.745
year          0.735
said          0.706
really        0.667
could         0.659
say           0.654
              ...  
told          0.456
minute        0.456
something     0.456
even          0.455
pretty        0.435
put           0.426
decided       0.422
ago           0.422
today         0.422
call          0.420
much          0.417
two           0.412
also          0.412
myself        0.410
want          0.409
house         0.407
face          0.400
last          0.394
old           0.392
their         0.391
re            0.389
http          0.388
made          0.388
ll            0.387
wasn          0.380
let           0.380
hand          0.375
left          0.372
getting       0.372
wife          0.365
dtype: float64
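
Rather than eyeballing the two lists, the two normalized frequency series can be aligned and compared directly (a sketch; freq_compare is not in the original notebook, and tokens missing from one vocabulary are treated as zero):

In [ ]:
# Sketch: largest per-token frequency gaps between the two groups
p_freq = p_frame.sum() / 869
n_freq = n_frame.sum() / 1000
freq_compare = pd.DataFrame({'ptsd': p_freq, 'non_ptsd': n_freq}).fillna(0)
freq_compare['diff'] = freq_compare['ptsd'] - freq_compare['non_ptsd']
freq_compare.sort_values('diff', ascending=False).head(20)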

In [153]:
p_df.describe()


Out[153]:
        flag  text
count   1000  1000
unique     1   869
top     PTSD
freq    1000   132

In [144]:
n_df.describe()


Out[144]:
            flag                                               text
count       1000                                               1000
unique         1                                               1000
top     non_PTSD  afternoon i mid fap i heard phone buzz i looke...
freq        1000                                                  1
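
The describe() output also shows why the PTSD frequencies above were divided by 869: only 869 of the 1000 PTSD posts are unique, with the most common text repeated 132 times, whereas every non-PTSD post is unique. If duplicate posts should not be double-counted in later modeling, a deduplication step could look like this (a sketch; p_df_unique is my name, not from the original notebook):

In [ ]:
# Sketch: drop repeated PTSD posts before further analysis
p_df_unique = p_df.drop_duplicates(subset='text')
p_df_unique.shape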
