Why text is bad for you


In [ ]:
import pandas as pd

import re

pd.set_option('display.max_colwidth', 300)  # show longer text snippets in DataFrame output

In [ ]:
df = pd.read_csv('../data/wiki/wiki.csv.gz', encoding='utf8', index_col=None)

In [ ]:
# keep only the first sentence of each article (everything before the first period)
df['text'] = df.text.str.extract(r'^(.*?)\.', expand=False)

In [ ]:
df.head()

So what can we do?


In [ ]:
import nltk
# nltk.download('all')  # uncomment on first run to fetch the NLTK data packages

In [ ]:
from nltk.tokenize import SpaceTokenizer
tokenizer = SpaceTokenizer()

In [ ]:
tokenizer.tokenize('He takes long walks')
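
SpaceTokenizer splits on spaces and nothing else, so punctuation stays glued to the neighbouring word. A quick sketch of the problem, using a made-up sentence:

In [ ]:
# the trailing period rides along: ['He', 'takes', 'long', 'walks.']
tokenizer.tokenize('He takes long walks.')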

In [ ]:
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()

In [ ]:
stemmer.stem('Walks')
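
Note that the stemmer also lowercases, so different inflections of a word collapse to a single stem. A small illustration on a made-up word list:

In [ ]:
# every form maps to the same stem, 'walk'
[stemmer.stem(word) for word in ('walk', 'Walks', 'walked', 'walking')]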

In [ ]:
def tokenize_only(text):
    tokens = tokenizer.tokenize(text)
    
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    
    return filtered_tokens

def tokenize_and_stem(text):
    tokens = tokenize_only(text)
    # build a list (not a lazy map object) so the result is easy to inspect and reuse
    stems = [stemmer.stem(token) for token in tokens]
    return stems
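
A quick sanity check of the two helpers on a made-up sentence:

In [ ]:
# expect ['he', 'take', 'long', 'walk']
tokenize_and_stem('He takes long walks')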

OK, but how does that help me?


In [ ]:
sample = df[df.name.isin(['4chan', '8chan', 'Aerosmith', 'Alabama', 'Texas'])]

In [ ]:
sample

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [ ]:
# min_df=2: keep only terms that appear in at least two documents
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, tokenizer=tokenize_and_stem)

In [ ]:
vec_text = tfidf_vectorizer.fit_transform(sample.text)

In [ ]:
vec_text
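
fit_transform returns a SciPy sparse matrix: one row per document, one column per vocabulary term, and most entries zero. A quick look at its dimensions:

In [ ]:
# rows = documents, columns = terms; nnz counts the non-zero tf-idf weights
print(vec_text.shape)
print(vec_text.nnz, 'non-zero entries')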

In [ ]:
pd.DataFrame(vec_text.toarray())

In [ ]:
tfidf_vectorizer.get_feature_names_out()  # get_feature_names() in scikit-learn < 1.0

In [ ]:
tfidf_vectorizer.idf_
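
With scikit-learn's defaults (smooth_idf=True), these weights follow idf(t) = ln((1 + n) / (1 + df(t))) + 1, where n is the number of documents and df(t) is the number of documents containing term t. A sketch verifying that, assuming those defaults:

In [ ]:
import numpy as np

n_docs = vec_text.shape[0]
doc_freq = (vec_text.toarray() > 0).sum(axis=0)  # documents containing each term
np.allclose(tfidf_vectorizer.idf_, np.log((1 + n_docs) / (1 + doc_freq)) + 1)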

In [ ]:
vec_df = pd.DataFrame(vec_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

In [ ]:
sample.reset_index(drop=True).join(vec_df)
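
This is the payoff: once each article is a vector, we can measure how similar two articles are. A minimal sketch using cosine similarity on the sample:

In [ ]:
from sklearn.metrics.pairwise import cosine_similarity

# pairwise similarity between the five articles; 1.0 on the diagonal
sim = cosine_similarity(vec_text)
pd.DataFrame(sim, index=sample.name, columns=sample.name)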
