In [1]:
# Parse data/sectors.txt (comma-separated) into a list of 2-tuples.
# Each tuple is (second field, first field), both lower-cased; presumably
# this is (brand, sector) -- confirm against the file's column order.
with open('data/sectors.txt') as sectors_file:
    split_rows = [line.strip().split(',') for line in sectors_file]

# The first parsed row is discarded (presumably a header line -- TODO confirm).
brand_cat = [(fields[1].lower(), fields[0].lower()) for fields in split_rows][1:]

In [ ]:
# Dictionary view of data/sectors.txt: maps the lower-cased second field of
# each row to the lower-cased first field (presumably brand -> sector;
# confirm against the file's column order).
#
# The previous version of this cell rebound `brand_cat` to a dict and then
# called `.append()` and sliced it, both of which raise on a dict. The mapping
# now lives under its own name so the list `brand_cat`, which later cells
# index as tuples, is left untouched.
brand_to_cat = {}
with open('data/sectors.txt') as data_file:
    # Skip the first row up front; the list-building cell discards it as well.
    next(data_file, None)
    for row in data_file:
        data = row.strip().split(',')
        # If the same key appears on several rows, the last row wins.
        brand_to_cat[data[1].lower()] = data[0].lower()

In [2]:
# Sorted list of the lower-cased first whitespace-delimited token of every
# line in the followers file (presumably the brand handle -- TODO confirm).
with open('data/brand_followers_corrected_full.tsv') as followers_file:
    brand_names = sorted(
        line.strip().split()[0].lower() for line in followers_file
    )

In [3]:
# Distinct first elements of the brand_cat tuples.
brands_sect = {pair[0] for pair in brand_cat}
# Distinct names read from the followers file.
brand_data = set(brand_names)

In [4]:
# Set of whitespace-stripped lines from data/correct_brand_names.txt
# (case is left as it appears in the file).
with open('data/correct_brand_names.txt') as names_file:
    brand_in_progress = {line.strip() for line in names_file}

In [8]:
# Names removed by hand from the "missing brands" report below.
exclude = {
    'acne_free',
    'acura_insider',
    'airwickus',
    'ancientminerals',
    'bachflowerdfn',
    'biothermusa',
    'goldpeaktea',
}

In [9]:
# Number of names in brands_sect that are absent from the followers data,
# the in-progress list, and the manual exclusions.
len(brands_sect.difference(brand_data, brand_in_progress, exclude))


Out[9]:
81

In [10]:
# Print, one per line in sorted order, every name in brands_sect that is not
# in the followers data, the in-progress list, or the manual exclusions.
for missing_name in sorted(brands_sect.difference(brand_data, brand_in_progress, exclude)):
    print(missing_name)


americanapparel
cheezit
chefboyardee
chevrolet
chexcereal
chinaglaze
chipsahoy
chobani
chrysler
chuaochocoholic
ciaobellagelato
cisco
claim_jumper
clairolcolor
clarisonic
claussenpickles
cleanandclear
cleanwell
clearasil
clifbar
clorox
coastalscents
cobigelowny
cocacola
coffee_mate
coldeeze
colgate
colorescience
combos
conairbeauty
coola_suncare
corazonas
cosmedix
cosmermaid
cottonelle
covergirl
crabtreeevelyn
crackerbarrel
crackerjackd
cream_of_wheat
cremeofnature
crest
croftersorganic
crushfoods
crystallight
ctcsquares
cuddleclones
cuisinart
curve
customnails
cvsbeautyclub
cwhairlondon
dacorkitchen
daisyhealthrd
danieleinc
dannon
dasaniwater
davesgourmet
davinesofficial
dawndish
ddfskincare
deans_beans
decleorskincare
delallofoods
deliskincare
dell
delmonte
dentek
derma_e
dermadoctor
dermalogica
dermorganic
devotedcreation
dgskincare
dial
diapersdotcom
dietcoke
dietpepsi
johnfriedaus
naturespath
thecoffeebean

In [7]:
# Names present in the followers data but not in brands_sect.
brand_data.difference(brands_sect)


Out[7]:
set()

In [50]:
# Micro-benchmark: time 8000 constructions of a 500-element np.arange.
import numpy as np
from datetime import datetime, timedelta  # `datetime` is still imported in case later cells use it
from time import perf_counter

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short durations; datetime.now() is wall-clock time and can jump if the
# system clock is adjusted.
s = perf_counter()
for i in range(8000):
    x = np.arange(500)
# Wrap in timedelta so the printed value keeps the H:MM:SS.ffffff format.
elapsed = timedelta(seconds=perf_counter() - s)
print('elapsed', elapsed)


elapsed 0:00:00.070046

In [ ]:


In [ ]: