In [ ]:
import pandas as pd
import re
pd.set_option('display.max_colwidth', 300)
In [ ]:
# load the Wikipedia sample; each row holds an article 'name' and its 'text'
df = pd.read_csv('../data/wiki/wiki.csv.gz', encoding='utf8', index_col=None)
In [ ]:
# keep only the first sentence of each article: everything up to the first period
df['text'] = df.text.str.extract(r'^(.*?)\.', expand=False)
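A quick illustrative check (not part of the original load): the non-greedy pattern keeps only the text before the first period, i.e. roughly the first sentence of each article.
In [ ]:
# sketch: apply the same pattern to a made-up string to see what survives
pd.Series(['Texas is a state in the United States. It borders Mexico.']).str.extract(r'^(.*?)\.', expand=False)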
In [ ]:
df.head()
In [ ]:
import nltk
# nltk.download('all')  # uncomment on first run to fetch the NLTK data packages
In [ ]:
from nltk.tokenize import SpaceTokenizer
tokenizer = SpaceTokenizer()
In [ ]:
tokenizer.tokenize('He takes long walks')
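One caveat worth seeing (illustrative example, not from the original): SpaceTokenizer splits on spaces only, so punctuation stays attached to the neighbouring word.
In [ ]:
# 'walks,' and 'day.' come back with their punctuation still attached
tokenizer.tokenize('He takes long walks, twice a day.')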
In [ ]:
from nltk.stem.snowball import EnglishStemmer
stemmer = EnglishStemmer()
In [ ]:
stemmer.stem('Walks')
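A few more illustrative stems (added for clarity): the Snowball stemmer lowercases its input and collapses common inflected forms onto a shared stem.
In [ ]:
# 'walks', 'walking', 'walked' and 'Walk' should all reduce to 'walk'
[stemmer.stem(word) for word in ['walks', 'walking', 'walked', 'Walk']]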
In [ ]:
def tokenize_only(text):
    tokens = tokenizer.tokenize(text)
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    filtered_tokens = [token for token in tokens if re.search('[a-zA-Z]', token)]
    return filtered_tokens

def tokenize_and_stem(text):
    tokens = tokenize_only(text)
    # stem each remaining token; return a list (not a lazy map) so it can be reused
    stems = [stemmer.stem(token) for token in tokens]
    return stems
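A quick sanity check of both helpers on a made-up sentence (this cell is illustrative, not from the original notebook): the numeric token is dropped and every remaining token is stemmed.
In [ ]:
# '5' contains no letters, so it is filtered out before stemming
tokenize_and_stem('She was walking 5 long walks every day')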
In [ ]:
# work with a small, recognisable subset of articles so the vectors stay readable
sample = df[df.name.isin(['4chan', '8chan', 'Aerosmith', 'Alabama', 'Texas'])]
In [ ]:
sample
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
In [ ]:
# min_df=2 keeps only terms that appear in at least two documents;
# tokens come from the custom tokenize_and_stem helper defined above
tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=2, tokenizer=tokenize_and_stem)
In [ ]:
# learn the vocabulary and idf weights, then turn each sample text into a tf-idf vector
vec_text = tfidf_vectorizer.fit_transform(sample.text)
In [ ]:
vec_text
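A brief look at the sparse result (added for clarity): rows correspond to the sample articles and columns to the terms the vectorizer kept.
In [ ]:
# (number of documents, size of the vocabulary after min_df filtering)
vec_text.shape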
In [ ]:
# densify the sparse matrix for inspection (fine for a sample this small)
pd.DataFrame(vec_text.toarray())
In [ ]:
# get_feature_names() was removed in newer scikit-learn; get_feature_names_out() is the replacement
tfidf_vectorizer.get_feature_names_out()
In [ ]:
tfidf_vectorizer.idf_
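To make the idf values easier to read (illustrative cell, assuming a scikit-learn version that provides get_feature_names_out), pair each term with its inverse document frequency; terms that occur in fewer of the sample articles get a higher idf.
In [ ]:
# rarer terms carry more weight in the tf-idf product
pd.Series(tfidf_vectorizer.idf_,
          index=tfidf_vectorizer.get_feature_names_out()).sort_values(ascending=False).head(10)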
In [ ]:
# label the columns with the vocabulary terms so each score is readable
vec_df = pd.DataFrame(vec_text.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
In [ ]:
sample.reset_index(drop=True).join(vec_df)
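As a final illustrative step (not in the original notebook), pull out the three highest-weighted terms per article to see what tf-idf emphasises for each document.
In [ ]:
# nlargest(3) on each row gives the terms with the highest tf-idf scores
top_terms = vec_df.apply(lambda row: row.nlargest(3).index.tolist(), axis=1)
pd.DataFrame({'name': sample.name.reset_index(drop=True), 'top_terms': top_terms})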