Why text is bad for you


In [ ]:
import pandas as pd

import re

pd.set_option('display.max_colwidth', 300)  # show longer text snippets in DataFrame output

In [ ]:
df = pd.read_csv('../data/wiki/wiki.csv.gz', encoding='utf8', index_col=None)

In [ ]:
# keep only the first sentence of each article (everything before the first period)
df['text'] = df.text.str.extract(r'^(.*?)\.', expand=False)

In [ ]:
df.head()

So what can we do?


In [ ]:
import nltk
# nltk.download('all')  # uncomment on first run to fetch the NLTK data packages

In [ ]:
from nltk.tokenize import SpaceTokenizer
tokenizer = SpaceTokenizer()

In [ ]:
tokenizer.tokenize('He takes long walks')
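
SpaceTokenizer splits on spaces and nothing else, so punctuation stays glued to the neighbouring word. A quick sketch of the problem, using a made-up sentence:

In [ ]:
# the trailing period rides along: ['He', 'takes', 'long', 'walks.']
tokenizer.tokenize('He takes long walks.')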

In [ ]:
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()

In [ ]:
stemmer.stem('Walks')
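
Note that the stemmer also lowercases, so different inflections of a word collapse to a single stem. A small illustration on a made-up word list:

In [ ]:
# every form maps to the same stem, 'walk'
[stemmer.stem(word) for word in ('walk', 'Walks', 'walked', 'walking')]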

In [ ]:
def tokenize_only(text):
    tokens = tokenizer.tokenize(text)
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    
    return filtered_tokens

def tokenize_and_stem(text):
    tokens = tokenize_only(text)
    # build a list (not a lazy map object) so the result is easy to inspect and reuse
    stems = [stemmer.stem(token) for token in tokens]
    return stems
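
A quick sanity check of the two helpers on a made-up sentence:

In [ ]:
# expect ['he', 'take', 'long', 'walk']
tokenize_and_stem('He takes long walks')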

OK, but how does that help me?


In [ ]:
sample = df[df.name.isin(['4chan', '8chan', 'Aerosmith', 'Alabama', 'Texas'])]

In [ ]:
sample

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [ ]:
# min_df=2: keep only terms that appear in at least two documents
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, tokenizer=tokenize_and_stem)

In [ ]:
vec_text = tfidf_vectorizer.fit_transform(sample.text)

In [ ]:
vec_text
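
fit_transform returns a SciPy sparse matrix: one row per document, one column per vocabulary term, and most entries zero. A quick look at its dimensions:

In [ ]:
# rows = documents, columns = terms; nnz counts the non-zero tf-idf weights
print(vec_text.shape)
print(vec_text.nnz, 'non-zero entries')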

In [ ]:
pd.DataFrame(vec_text.toarray())

In [ ]:
tfidf_vectorizer.get_feature_names_out()  # get_feature_names() in scikit-learn < 1.0

In [ ]:
tfidf_vectorizer.idf_
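
With scikit-learn's defaults (smooth_idf=True), these weights follow idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t. A sketch verifying that, assuming those defaults:

In [ ]:
import numpy as np

n_docs = vec_text.shape[0]
doc_freq = (vec_text.toarray() > 0).sum(axis=0)  # documents containing each term
np.allclose(tfidf_vectorizer.idf_, np.log((1 + n_docs) / (1 + doc_freq)) + 1)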

In [ ]:
vec_df = pd.DataFrame(vec_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [ ]:
sample.reset_index(drop=True).join(vec_df)
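
This is the payoff: once each article is a vector, we can measure how similar two articles are. A minimal sketch using cosine similarity on the sample:

In [ ]:
from sklearn.metrics.pairwise import cosine_similarity

# pairwise similarity between the five articles; 1.0 on the diagonal
sim = cosine_similarity(vec_text)
pd.DataFrame(sim, index=sample.name, columns=sample.name)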
