In [231]:
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.stem.snowball import SnowballStemmer
import re
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from collections import defaultdict
import time
%matplotlib inline

In [132]:
# Shared text-normalisation helpers used by the breed-extraction cells below:
# the NLTK English stop-word list as a set, and an English Snowball stemmer.
stopword_set = {word for word in stopwords.words('english')}
snowball = SnowballStemmer(language='english')

In [2]:
# Load the review table from the project's S3 bucket.
reviews_uri = "s3://dogfaces/reviews/reviews.csv"
df_reviews = pd.read_csv(reviews_uri)

In [3]:
# Preview the first rows of the reviews table.
df_reviews.head()


Out[3]:
rating review_content review_id review_time toy_id toy_name user_name
0 5 My Fur Baby loves his Rope pull toy. I brings ... 185682550 Sep 4, 2017 108574 mammoth-cottonblend-5-knot-dog-rope Gunnerfavorite
1 5 We love this toy. I got the largest size for m... 185591533 Sep 1, 2017 108574 mammoth-cottonblend-5-knot-dog-rope Lindsay
2 4 this was a good product but not for very stron... 185522967 Aug 29, 2017 108574 mammoth-cottonblend-5-knot-dog-rope bcodpas3
3 5 The dogs aren't big chewers so the rope is hol... 185487336 Aug 28, 2017 108574 mammoth-cottonblend-5-knot-dog-rope SickPup
4 5 My Aussies love this rope. It's long enough th... 185484167 Aug 28, 2017 108574 mammoth-cottonblend-5-knot-dog-rope AussieMom

In [4]:
# Show a word cloud built from a random sample of review texts.
#
# The stop-word set gets its own name on purpose: binding it to `stopwords`
# would shadow the `nltk.corpus.stopwords` module imported at the top of the
# notebook, and re-running the cell that builds `stopword_set` would then fail.
cloud_stopwords = set(STOPWORDS) | {"dog", "dogs", "toy", "toys", "love", "loves", "one"}

# Drop missing reviews (they are not strings and would break the join) and
# seed the sample so the figure is reproducible across kernel restarts.
review_texts = df_reviews['review_content'].dropna().astype(str)
review_sample = review_texts.sample(n=min(1000, len(review_texts)), random_state=42)

wc = WordCloud(background_color="white", max_words=2000,
               stopwords=cloud_stopwords, random_state=42)
cloud = wc.generate(' '.join(review_sample.values))
plt.figure(figsize=(10,8))
plt.imshow(cloud, interpolation='bilinear')
plt.axis('off')
plt.title('Wordcloud of Chewy Reviews', fontsize=20)
plt.show()


Read dog breed information


In [113]:
# source 1: breed names and nicknames taken from the web,
# one "<rank>. <breed> = <nickname(s)>" entry per line.
breed_file = "breed_nick_names.txt"
df_breed = pd.read_csv(breed_file, header=None, names=['breed_info'])
df_breed.head()


Out[113]:
breed_info
0 1. Labrador Retriever = Lab
1 2. Yorkshire Terrier = Yorkie
2 3. German Shepherd Dog = GShep or GSD
3 4. Golden Retrievers = Golden
4 5. Beagles = Beagle

In [114]:
# (rows, columns) of the breed nickname table.
df_breed.shape


Out[114]:
(160, 1)

In [115]:
# Parse lines such as "1. Labrador Retriever = Lab" into
# {lower-cased breed name: raw lower-cased nickname string}.
#
# The pattern is written as a raw string so that \d, \s and \D reach the regex
# engine untouched instead of being treated as (invalid) string escapes.
# It skips one digit plus the following character and whitespace, then
# captures everything up to the next digit.
breed_line_re = re.compile(r'\d.\s+(\D*)')

breeds_info = df_breed['breed_info'].values
breed_dict = {}
for breed in breeds_info:
    found = breed_line_re.findall(breed.lower())
    parts = found[0].strip().split('=') if found else []
    if len(parts) < 2:
        # Fail loudly and name the offending line rather than raising a bare IndexError.
        raise ValueError("cannot parse breed line: %r" % (breed,))
    breed_dict[parts[0].strip()] = parts[1].strip()

In [116]:
# 1. different nicknames are separated with 'or'
# Iterate over a snapshot of the items (valid on Python 2 and 3) and store a
# real list rather than a lazy `map` object.
for k, v in list(breed_dict.items()):
    # Only split values that are still raw strings, so re-running this cell
    # after the values have become lists/sets does not raise.
    if hasattr(v, 'split'):
        breed_dict[k] = [nick.strip() for nick in v.split(' or ')]

In [136]:
# 2. get n-gram and stemmed words breed_dict
# For every breed, build the set of strings that may refer to it:
# the nicknames, the full breed name, the stemmed form of each of those,
# and every individual token of the breed name (raw and stemmed).
for k, v in list(breed_dict.items()):  # snapshot: entries are rebound below
    aliases = set(v)
    aliases.add(k)
    aliases |= {snowball.stem(alias) for alias in aliases}
    for word in word_tokenize(k):
        aliases.add(word)
        aliases.add(snowball.stem(word))
    # 'dog'/'dogs' and English stop words are dropped from the alias set.
    breed_dict[k] = aliases - {'dog', 'dogs'} - stopword_set

In [193]:
# Invert breed_dict into alias -> set of breed names carrying that alias.
breed_lookup = defaultdict(set)
for k, v in breed_dict.items():
    for word in v:
        breed_lookup[word].add(k)

# Aliases removed from the lookup by hand (presumably because they are common
# everyday words that would produce false matches -- confirm with the author).
del_list = ['toy','blue','great','duck','coat','wire','st.','white','grey',
            'black','old','smooth','west','soft']
for w in del_list:
    # pop with a default so a missing alias is not an error
    breed_lookup.pop(w, None)

# print() with a single argument behaves the same on Python 2 and 3.
print(list(breed_lookup.keys()))
print(len(breed_lookup))


['miniature pinschers', 'pointing', u'yorkshir', 'welsh springer', 'lowland', 'manchester terriers', u'chow chow', 'kuyaszok', u'jack russel', u'chesapeak', 'tzus', u'manchest', 'weimaraner', 'bassets', u'cavali', 'shar pei', 'cocker spaniel', u'basenji', 'german', 'polski owczarek', u'newfi', u'english foxhound', 'beauceron', u'crest', u'west highland white terri', 'bichons frises', 'aussie', 'jack', 'english foxhounds', 'anatolian', u'norfolk terri', 'shetland', u'norfolk', 'miniature bull terriers', 'nanny dog', 'australian terrier', u'harrier', u'welsh spr', 'frises', u'polish lowland sheepdog', u'bichons fri', 'japanese', 'english toy spaniels', u'samoy', u'brussels griffon', 'norfolk terriers', 'silky terriers', u'australian cattle dog', u'lhasa', 'bernese', u'toll', u'manchester terri', 'rhodesian', u'bearded colli', 'soft coated wheaten terriers', 'welsh springer spaniels', 'amstaff', 'french', 'water', u'black russian terri', 'german shorthaired pointers', 'elkhounds', 'curly-coated retrievers', 'black and tan coonhounds', u'rottweil', 'komondorok', 'havanese', 'silkies', u'bouvier', u'wirehaired pointing griffon', 'lhasa apsos', u'peke', 'miniature', 'welsh terriers', 'wirehaired pointing griffons', 'wirehaired', 'golden', 'italiani', 'scottish terriers', u'jack russell terri', 'japanese chin', 'irish wolfhounds', u'gordon', 'airedale terriers', 'dachshund', u'irish wolfhound', u'australian terri', 'shetland sheepdogs', 'plotts', u'english cocker spaniel', 'lhasas', 'poodle', 'pit bull', u'english toy spaniel', 'bernese mountain dogs', 'australian', 'belgian', 'anatolian shepherd dogs', 'scotia', 'bullmastiffs', 'brussels griffons', 'crested', u'kerri', 'doxie', 'scottish deerhounds', 'malamutes', u'bernes', 'spaniels field', u'springer', u'retriev', u'portugu', u'malt', u'kerry blue terri', u'ibizan hound', u'schipperk', 'pyrenees', u'portugues', 'newfoundland', u'rhodesian ridgeback', 'dandies', 'old english sheepdogs', 'chows', u'irish water spaniel', 
u'greater swiss mountain dog', 'pons', u'eski', 'chin', u'havanes', 'tan', u'pinscher', 'lowchen', 'swedish', 'bullmastiff', 'imaal', u'iggi', u'weimaran', 'coonhound', u'pit bul', 'shiba', u'flat-coated retriev', 'pinschers', 'skye terriers', u'chines', u'chow', 'lab', u'pharaoh hound', 'petits', u'affenpinsch', 'flat-coated retrievers', 'maltese', 'petits bassets griffons vendeens', 'eskie', 'rhodesian ridgebacks', 'german pinscher', u'lakeland terri', u'brussel', u'norwegian elkhound', 'tibetan spaniels', u'apso', 'otterhound', u'cattl', 'airedale', 'ridgebacks', 'basenjis', 'wire fox terriers', 'australian shepherd', 'italian greyhounds', u'border terri', u'japan', u'anatolian shepherd dog', u'pointer', 'glen', 'papillon', 'terriers', 'swedish vallhunds', 'sealyham', 'hounds', 'border terrier', 'whippets', 'english springer spaniels', 'greater', 'clumbers', u'english sett', 'salukis', u'afghan hound', u'sealyham terri', 'huskies', u'old english sheepdog', u'welsh springer spaniel', 'smooth fox terriers', 'wheaten', 'australian terriers', u'wire fox terri', u'ridgeback', u'peking', 'pembroke welsh corgi', 'terv', 'retrievers', u'doxi', u'toy fox terri', 'cavalier king charles spaniels', u'cairn', 'french bulldogs', 'keeshonden', 'ibizan hounds', 'tolling', u'spinon', 'briards', u'miniature bull terri', u'gordon sett', 'huskie', u'standard schnauz', 'irish water spaniels', u'silki', u'beagl', 'st. 
bernards', 'tibetan terriers', u'lhasa apso', 'springers', 'bully', 'sbt', 'norwich', 'pug', 'standard', u'bulli', 'retriever', 'jack russell', 'boston terrier', 'chow chows', 'american', 'borzois', 'sealyham terriers', 'curly-coated', 'german shepherd dog', 'border collies', 'yorkshire terrier', '"cavalier"', 'inu', u'pyren', 'chesapeake bay retrievers', u'airedale terri', 'belgian sheepdogs', 'malinois', u'terrier', 'cardigan welsh corgis', 'finnish spitz', 'chinese shar-pei', 'american pit bull terrier', u'welsh spring', u'beard', 'bulldog', 'plotts hound', 'lakeland terriers', u'minnie bull terri', 'belgian tervuren', u'scottish terri', u'swissi', u'irish terri', u'silky terri', 'kuvaszok', 'schnauzer', 'king', u'curly-co', u'american water spaniel', 'cavalier', 'gshep', 'dane', 'affenpinschers', u'pembrok', u'nova scotia duck tolling retriev', u'soft coated wheaten terri', u'greyhound', 'irish setters', 'dinmont', 'akita', 'norwich terriers', 'american staffordshire terriers', u'miniatur', 'flat-coated', 'border', 'mountain', 'griffons', 'cardigan', 'pekingese', 'doberman', u'skye terri', u'irish sett', u'shelti', u'aired', 'dalmatians', 'american sussex foxhounds', 'pyr', u'pharaoh', 'swiss', u'charl', 'bulldogs', u'shorthair', u'tibetan mastiff', u'chesapeake bay retriev', 'eskimo', u'kerry blu', 'tibetan', u'great pyrene', 'pembroke', u'american staffordshire terri', 'alaskan', 'labrador retriever', 'iggies', 'gordons', 'samoyeds', u'cardigan welsh corgi', 'corgis', 'fox terrier', 'belgian sheepdog', 'st. 
bernard', 'charles', u'malamut', 'polish lowland sheepdogs', 'manchester', u'scotti', 'sheepdogs', u'german wirehair', u'setter', 'giant', 'coonhounds', u'havan', u'skye', u'bernard', "gsp's", u'bedlington terri', 'spitz', 'chinese', 'shepherd', 'pwds', 'dandie dinmont terriers', u'great pyren', u'clumber spaniel', 'shorthaired', 'komondor', 'beagles', 'pharaohs', u'smooth fox terri', 'parson russell terriers', 'apsos', 'chesapeake', 'borzoi', 'bay', 'corgi', 'pointers', u'belgian malinoi', 'bichons', 'otterhounds', u'pekinges', 'cairns', 'bichon', 'norwegian elkhounds', 'cattle', 'bearded collies', 'foxhounds', u'water dog', 'afghan hounds', u'boston terri', u'american pit bull terri', u'westi', 'irish terriers', u'whippet', u'saluki', u'maltes', 'scotties', u'fox terri', u'bouviers flandr', 'spinoni italiani', 'smooths', u'great dan', 'tervuren', u'smoothi', 'chihuahua', u'canaan dog', 'mastiffs', 'pekes', u'chine', 'nizinny', 'bloodhound', u'dobi', 'staffordshire', u'nizinni', 'spinone', 'boxers', u'american sussex foxhound', 'sheltie', 'setters', 'kerry blues', 'spaniels', u'sheepdog', u'point', u'dandie dinmont terri', u'chinese crest', 'american water spaniels', 'shih', u'pon', 'dobie', 'pom', u'swedish vallhund', u'giant schnauz', 'scottish', 'yorkshire', u'flandr', 'bouviers flandres', 'german pinschers', u'malinoi', 'bull terriers', 'boston', u'frise', u'rhodi', 'toller', 'neapolitan mastiffs', u'dandi', u'neapolitan mastiff', u'aussi', u'spaniel', u'welsh terri', 'min pin', 'bull', 'brussels', 'airedales', u'air', 'ibizan', u'miniature schnauz', 'tzu', 'alaskan malamutes', 'danes', 'neapolitan', 'collies', 'schipperkes', 'glen of imaal terriers', u'staffordshir', 'dandie', u'parson russell terri', 'staffordshire bull terriers', 'russell', u'japanes', u'russel', u'italian greyhound', 'siberian', 'clumber spaniels', u'griffon', u'german shorthaired point', 'vendeens', u'bull terri', u'curly-coated retriev', 'doberman pinscher', u'flat-coat', 'foxhound', 
'bearded', 'wires', 'greyhounds', 'welsh', 'collie', 'norwegian', 'rottie', 'afghan', u'gsp', 'highland', 'westie', 'canaan dogs', 'keeshound', 'boxer', 'gsd', 'english', u'tibetan terri', 'spinoni', u'american eskimo dog', 'schnauzers', 'greys', 'skyes', 'deerhound', 'polish', 'smoothies', 'greater swiss mountain dogs', 'vallhunds', 'beagle', 'toy fox terriers', 'english cocker spaniels', u'basset', 'harriers', 'sussex', 'pulik', 'portuguese water dogs', 'wolfhounds', u'miniature pinsch', 'weimies', 'bernards', u'wirehair', u'petits bassets griffons vendeen', 'great pyrenees', u'pyrene', 'chinese crested', 'finnish', 'english setters', u'vallhund', 'giant schnauzers', u'weimi', u'black and tan coonhound', 'jrt', 'siberian huskies', u'french bulldog', 'tibetan spaniel', 'italian', 'portuguese', 'american eskimo dogs', 'miniature schnauzer', 'border terriers', u'bichons fris', 'rottweilers', u'briard', u'english springer spaniel', u'staffi', 'ess', u'vendeen', 'samoyed', 'pharaoh hounds', 'fox', 'newfie', 'staffy', 'black russian terriers', 'water dogs', u'cairn terri', 'coated', 'bedlington terriers', 'pomeranian', 'oes', 'chessie', u'bernese mountain dog', 'weim', 'rhodies', 'jack russell terriers', u'clumber', 'dalmatian', 'kerry blue terriers', u'plott', u'yorki', 'brittany spaniel', 'great danes', 'mastiff', u'oe', 'brittany', u'border colli', u'cavalier king charles spaniel', 'parson', u'brittani', 'minnie bull terriers', 'rotty', 'bouviers', 'belgian malinois', u'yorkshire terri', 'canaan', 'cocker', u'rotti', u'staffordshire bull terri', 'tibetan mastiffs', u'shetland sheepdog', 'nova', 'flandres', 'cairn terriers', u'airedal', 'west highland white terriers', 'golden retrievers', u'siberian huski', 'russian', u'bern', 'shorthairs', 'black russian', 'yorkie', 'basset hound', 'shiba inu', u'glen of imaal terri', u'chessi', 'irish', 'labrador', 'wolfhound', u'petit', 'standard schnauzers', 'norfolks', 'anatolian shepherd', 'shar-pei', 'australian cattle dogs', 
'kerry', u'hound', 'pit', 'bedlington', u'poodl', 'manchesters', u'portuguese water dog', u'colli', u'norwich terri', 'field', 'german wirehaired', 'vizsla', u'doberman pinsch', 'gordon setters', 'swissies', 'silky', 'aussies', 'nova scotia duck tolling retrievers', u'german pinsch', 'shih tzu', 'pbgv', u'golden retriev', u'huski', 'lakeland', 'deerhounds', 'elkhound', u'scottish deerhound', 'affen', u'labrador retriev', u'alaskan malamut', 'husky']
617

In [209]:
# Try the lookup on a single review: tokenize, drop punctuation tokens and
# stop words, stem what is left, and collect every breed whose alias set
# contains one of the stems.
text_review = df_reviews['review_content'][50439].lower()
print(text_review)
puncs = string.punctuation
# `tok not in puncs` is a substring test against the punctuation string,
# so it removes single punctuation-mark tokens.
tokens = set(tok for tok in word_tokenize(text_review) if tok not in puncs)
reduced_set = set(snowball.stem(tok) for tok in tokens - stopword_set)
po_breeds = []
for w in reduced_set:
    if w in breed_lookup:
        # A breed is appended once per matching stem, so it can appear repeatedly.
        po_breeds.extend(breed_lookup[w])
print(po_breeds)


we've bought repeated ones of this toy.  our three german shepherds love the squeakers and i love the durability.  great toy for big chewers and pullers. washes well too!
['anatolian shepherd dogs', 'australian shepherd', 'german shepherd dog', 'german pinschers', 'german shepherd dog', 'german shorthaired pointers', 'german wirehaired']

In [213]:
# List the column names available in the reviews table.
df_reviews.columns


Out[213]:
Index([u'rating', u'review_content', u'review_id', u'review_time', u'toy_id',
       u'toy_name', u'user_name'],
      dtype='object')

In [235]:
def getReviewBreed(text):
    """Return the candidate dog breeds mentioned in one review.

    The review is lower-cased and tokenized, punctuation tokens and English
    stop words are removed, the remaining tokens are stemmed, and every stem
    found in ``breed_lookup`` contributes all breeds registered under it.

    Parameters
    ----------
    text : bytes or unicode text
        Raw review content. Byte strings are decoded as UTF-8; text that is
        already decoded is used as is.

    Returns
    -------
    list
        Breed names from ``breed_lookup``. A breed appears once per matching
        stem, so the list may contain duplicates. Empty when nothing matches
        or when ``text`` is not a string (e.g. a missing value).
    """
    if isinstance(text, bytes):
        # Python 2 `str` / raw bytes: decode explicitly.
        ntext = text.decode('utf-8')
    elif isinstance(text, type(u'')):
        # Already decoded text; calling .decode() on it would fail.
        ntext = text
    else:
        # Missing review (NaN/None) or other non-text value: nothing to match.
        return []

    # `tok not in string.punctuation` is a substring test against the
    # punctuation string, so it removes single punctuation-mark tokens.
    tokens = set(tok for tok in word_tokenize(ntext.lower())
                 if tok not in string.punctuation)
    reduced_set = set(snowball.stem(tok) for tok in tokens - stopword_set)

    po_breeds = []
    for w in reduced_set:
        if w in breed_lookup:
            po_breeds.extend(breed_lookup[w])
    return po_breeds

def getBreedTable(df):
    """Build a table of extracted breeds for every review in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``review_content``, ``review_id`` and
        ``toy_id``.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with a fresh default index and the columns
        ``review_id``, ``toy_id`` and ``breed_extract`` (the list returned by
        ``getReviewBreed`` for that review).
    """
    # Read the columns directly instead of looping over iterrows():
    # it avoids building a Series per row and keeps the id columns' dtypes.
    breed = [getReviewBreed(text) for text in df['review_content']]
    return pd.DataFrame({'review_id': df['review_id'].values,
                         'toy_id': df['toy_id'].values,
                         'breed_extract': breed})

In [ ]:
# Run the breed extraction over a copy of the full review table and report
# the elapsed wall-clock time in seconds.
test_df = df_reviews.copy()
start_time = time.time()
new_df = getBreedTable(test_df)
# print() with a single argument behaves the same on Python 2 and 3.
print(time.time() - start_time)

In [237]:
# Preview the extracted-breed table produced by getBreedTable.
new_df.head()


Out[237]:
breed_extract review_id toy_id
0 [pomeranian] 153701679 46989
1 [] 155032221 46955
2 [] 166467801 44293
3 [] 81797871 39344
4 [german pinschers, german shepherd dog, german... 181363534 55933

In [239]:
# (rows, columns) of the reviews table.
df_reviews.shape


Out[239]:
(61202, 7)

In [7]:
# source 2: classified dog names
# (one label per line, read into a single 'breed' column)
labels_uri = "s3://dogfaces/tensor_model/output_labels_20170907.txt"
breed_classes = pd.read_csv(labels_uri, header=None, names=['breed'])
breed_classes.head()


Out[7]:
breed
0 rottweiler
1 bull mastiff
2 french bulldog
3 cairn
4 yorkshire terrier

In [ ]:
# design algorithm to identify breed information from review text
def breed_dist(b1, b2):
    """Jaccard similarity between the stemmed word sets of two breed names.

    Despite the name, this is a similarity, not a distance: 1.0 means both
    names have the same set of stemmed words, 0.0 means they share none.

    Parameters
    ----------
    b1, b2 : str
        Breed names, possibly consisting of several words.

    Returns
    -------
    float
        ``|A & B| / |A | B|`` over the stemmed word sets; 0.0 when both names
        contain no words.
    """
    # Stem word by word: the stemmer works on single words, so stemming the
    # whole phrase at once would only ever touch its ending.
    br1 = {snowball.stem(word) for word in b1.split()}
    br2 = {snowball.stem(word) for word in b2.split()}
    union = br1 | br2
    if not union:
        # Both names empty/blank: avoid dividing by zero.
        return 0.0
    return len(br1 & br2) * 1.0 / len(union)

In [13]:
# NOTE(review): this re-creates the English Snowball stemmer with the same
# construction used for `snowball` near the top of the notebook; it looks
# redundant when the notebook is run top to bottom.
snowball = SnowballStemmer('english')

In [ ]: