Notebook by: Juan Shishido
In this notebook, I'll start cleaning the text columns and, more importantly, thinking about how to classify and group the data within them. Consider using n-grams for word occurrence.
In [1]:
import csv
import random
import re
from collections import Counter

import lda
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
# Raw merged purchase-order data; the relative path assumes the
# notebook is run from its own directory.
df = pd.read_csv('../../data/cleaned/UCB_dept_merge.csv')
In [3]:
# Combined free-text field used for topic modeling.
# fillna('') on each component prevents a NaN in any one column from
# turning the whole concatenated string into NaN (the issue noted in
# the markdown below this cell).
df['product_line'] = df['supplier_name'].fillna('') + ' ' + \
                     df['product_description'].fillna('') + ' ' + \
                     df['manufacturer'].fillna('')
Note: NaN values in any of these columns must be accounted for, since NaN propagates through string concatenation and makes the whole `product_line` NaN.
In [4]:
# Draw a reproducible 40,000-row sample of the data.
random.seed(8675309)
rows = random.sample(df.index, 40000)
# .loc replaces the deprecated .ix indexer; `rows` are index labels
# (a fresh RangeIndex from read_csv), so label-based lookup is correct.
df = df.loc[rows].reset_index()
In [5]:
# Preview the sampled rows.
df.head()
Out[5]:
The next cells define the text columns to clean and apply the normalization (lowercasing, removing URLs and special characters). In a later step, numbers not associated with a percent sign are removed.
In [6]:
# Text columns that go through the normalization pipeline below.
cols = [
    'supplier_name',
    'item_type',
    'product_description',
    'manufacturer',
    'buyer__first_name',
    'buyer__last_name',
    'department_name',
    'product_line',
]
In [7]:
# Normalization pipeline for the text columns. Patterns are compiled
# once here instead of being re-parsed on every .apply call, and raw
# strings avoid invalid-escape warnings.
url_re     = re.compile(r'(http\S*|www\S*)')    # URLs
slash_re   = re.compile(r'((?<=\D)/|/(?=\D))')  # '/' adjacent to a non-digit
special_re = re.compile(r'[^A-Za-z0-9.%/]+')    # anything not kept
dots_re    = re.compile(r'\.+')                 # runs of periods
lone_re    = re.compile(r'(?<=\s)\w(?=\s)')     # lone chars (\w already covers \d)
space_re   = re.compile(r'\s+')                 # whitespace runs

def clean_text(text):
    """Lowercase `text`, strip URLs/punctuation/stray single chars,
    and collapse whitespace. Applied in the same order as the original
    chained substitutions."""
    text = text.lower()
    text = url_re.sub('', text)
    text = slash_re.sub(' ', text)
    text = special_re.sub(' ', text)
    text = dots_re.sub('', text)
    text = lone_re.sub('', text)
    return space_re.sub(' ', text).strip()

for col in cols:
    # NaN -> '' first so .lower() never sees a float
    df[col] = df[col].replace(np.nan, '', regex=True).apply(clean_text)
In [8]:
# Confirm the cleaning pass produced normalized, lowercase text.
df.head()
Out[8]:
In [9]:
# Tokenize each combined product-line string with NLTK's word tokenizer.
tokenized_pd = [word_tokenize(line) for line in df.product_line]
Removing English stopwords from NLTK.
In [10]:
# NLTK's English stopwords plus domain-specific boilerplate that shows
# up in procurement descriptions (units, packaging, corporate suffixes).
domain_stop = [u'ea', u'per', u'item', u'description', u'quote', u'pk',
               u'pack', 'give', 'something', 'inc', 'corporation',
               'quantity', 'back', 'products', 'co', 'officemax',
               'unit', 'corp']
stop_words = stopwords.words('english') + domain_stop
This code removes stopwords, purely numeric tokens, and single-character tokens from each tokenized product line:
In [11]:
# Filter the tokenized product lines: drop stopwords, purely numeric
# tokens, and single-character tokens. A set makes each membership
# test O(1) instead of scanning the stopword list per token.
stop_set = set(stop_words)
tokenized_pd_clean = [
    [word for word in entry
     if word not in stop_set
     and not unicode(word).isnumeric()  # Python 2: str lacks .isnumeric()
     and len(word) > 1]
    for entry in tokenized_pd
]
In [12]:
# Keep the cleaned token lists alongside the source rows.
df['tokenized_pd_clean'] = tokenized_pd_clean
In [13]:
# Re-join each cleaned token list into a single space-separated string,
# since CountVectorizer expects raw documents, not token lists.
pd_list_clean = [' '.join(item) for item in tokenized_pd_clean]
In [14]:
# Bag-of-words counts. Tokenization and stopword removal were handled
# above, so the vectorizer's own preprocessing hooks are left disabled.
vectorizer = CountVectorizer(analyzer = "word",
                             tokenizer = None,
                             preprocessor = None,
                             stop_words = None)
In [15]:
# Dense document-term matrix. NOTE(review): .toarray() materializes an
# n_docs x vocab_size array in memory; keep the sparse matrix if the
# sample or vocabulary grows.
word_features = vectorizer.fit_transform(pd_list_clean).toarray()
In [16]:
# (n_documents, n_vocabulary_terms)
word_features.shape
Out[16]:
In [17]:
# Spot-check the first five rows of the count matrix.
word_features[0:5,:]
Out[17]:
In [18]:
# Feature names, ordered by column index in the count matrix.
vocab = vectorizer.get_feature_names()
print vocab[:15]
In [19]:
# term -> column-index mapping for the count matrix.
vocab_map = vectorizer.vocabulary_
In [23]:
# Alias for the LDA input matrix.
X = word_features
In [24]:
# Fit a 15-topic LDA model (seeded for reproducibility).
# NOTE(review): `import lda` is commented out in the imports cell, so
# this raises NameError on a fresh kernel — re-enable that import.
model = lda.LDA(n_topics=15, n_iter=1500, random_state=8675309)
model.fit(X)
Out[24]:
In [25]:
# Persist the top words for each topic. 'wb' is the correct csv mode
# under Python 2 (avoids newline translation on Windows).
# The unused `doc_topic` assignment from the original cell is removed;
# the next cell re-reads model.doc_topic_ before using it.
topic_word = model.topic_word_
n_top_words = 21
with open('../../results/topic_definitions.csv', 'wb') as to_:
    writer = csv.writer(to_, delimiter=',', quotechar='\"')
    for i, topic_dist in enumerate(topic_word):
        # argsort is ascending; slicing from the end yields the 20
        # highest-probability words, best first.
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
        writer.writerow([i, ' '.join(topic_words)])
In [26]:
# Write each document's cleaned token list alongside its most likely
# topic (argmax of the per-document topic distribution).
doc_topic = model.doc_topic_
with open('../../results/pd_topics.csv', 'wb') as to_:
    writer = csv.writer(to_, delimiter=',', quotechar='\"')
    # enumerate replaces the range(len(...)) index loop.
    for i, tokens in enumerate(tokenized_pd_clean):
        # NOTE(review): this writes the Python list repr of the tokens
        # as the first csv field; downstream readers parse positionally.
        writer.writerow([tokens, doc_topic[i].argmax()])
In [27]:
words = [w.strip().split(' ') for w in pd_list_clean]
word_list = [i for word in words for i in word]
word_counts = Counter(word_list)
top_100_words = word_counts.most_common(100)
for word in top_100_words:
print word
Merge the topic assignments when number of topics is 10.
In [20]:
# Topic assignments produced by a separate 10-topic run.
topics = pd.read_csv('../../results/pd_topics_10.csv', header=None)
In [21]:
# NOTE(review): column 0 holds the token-list strings and column 1 the
# topic id (see how pd_topics is written above), so the name 'tpc' is
# misleading — confirm before renaming downstream.
topics.columns = ['tpc', 'topic']
In [22]:
# Row-aligned join: assumes the topics csv rows are in the same order
# as df (it was written by iterating df in order) — TODO confirm.
df['tpc'] = topics.tpc
In [23]:
# Most-likely topic id per row from the 10-topic model.
df['topic'] = topics.topic
In [24]:
# Purchase counts per department, sorted most active first.
depts = pd.DataFrame({'count' : df.groupby('department_name')['department_name'].count()}).reset_index()
# sort_values replaces DataFrame.sort, which was deprecated in pandas
# 0.17 and removed in 0.20; arguments are otherwise identical.
depts.sort_values('count', ascending=False, inplace=True)
In [25]:
# NOTE(review): named top15 but slices the top 25 departments — confirm
# which cutoff is intended and make the name and the slice agree.
top15 = depts['department_name'][:25].tolist()
In [26]:
# Restrict to purchases from the most active departments.
df_top15 = df[df.department_name.isin(top15)]
In [27]:
# Drop rows whose combined product-line text cleaned down to nothing.
df_top15 = df_top15[df_top15['product_line'] != '']
In [28]:
# Purchase counts per (department, topic) pair.
topics_by_dept = pd.DataFrame({'count' : df_top15.groupby(['department_name', 'topic'])['topic'].count()}).reset_index()
In [29]:
# Topic definitions from the 10-topic run.
# NOTE(review): the filename here is 'topics_definitions_10.csv' but
# the 15-topic cell above wrote 'topic_definitions.csv' — confirm the
# naming convention between runs.
topic_def = pd.read_csv('../../results/topics_definitions_10.csv', header=None)
In [30]:
# Keep only the ten most probable words per topic for readability.
topic_def.columns = ['topic', 'words']
topic_def['words'] = topic_def['words'].apply(lambda x: ', '.join(x.split()[:10]))
In [31]:
# Attach the human-readable topic words to each (department, topic) count.
df_top15_final = pd.merge(topics_by_dept, topic_def, on='topic')
In [32]:
# Final department x topic summary for the 10-topic model.
df_top15_final.to_csv('../../results/topic_count_10.csv', index=False)
In [ ]: