In [1]:
import cPickle
import os

import numpy as np
from numpy.core.defchararray import add as stringVecAdd
from scipy.io import savemat
from scipy.sparse import hstack as sparse_hstack
from sklearn.feature_extraction.text import TfidfVectorizer

from SQLConnect import sqlSession

In [5]:
# Export a shuffled sample of reviews from every imported category table to
# <RAW_DATA_DIR>/rawData_<table>.npy.  For each table, at most N rows are taken
# per score value (1..5), where N is the number of score=1 rows capped at
# MAX_REVIEWS_PER_SCORE.
IMPORTED_DIR='/home/hencrice/Downloads/AmazonProductDataset_MySQL/imported'
RAW_DATA_DIR='/home/hencrice/Downloads/AsterixDBClassData'
MAX_REVIEWS_PER_SCORE=20000  # upper bound on the rows fetched for each score value
# {0}=table name, {1}=score value, {2}=row limit.  The table name has to be
# formatted into the statement because SQL identifiers cannot be bound as
# query parameters; it comes from the local directory listing below.
SCORE_SELECT_TEMPLATE="(select cast(score as unsigned) as 'score', summary, text from cs277.{0} where score={1} limit {2})"

# np.save does not create missing parent directories.
if not os.path.isdir(RAW_DATA_DIR):
    os.makedirs(RAW_DATA_DIR)

# Only the top-level files of IMPORTED_DIR are considered (first os.walk entry).
txtFileList=[fName for fName in next(os.walk(IMPORTED_DIR))[2] if fName.endswith('.txt')]
with sqlSession(True) as cnxn:
    cur=cnxn.cursor()
    for fname in txtFileList:
        # The table name is the file name without its '.txt' extension.
        tableName=os.path.splitext(fname)[0]
        print('fetching table {0} ......'.format(tableName))
        cur.execute("""select count(*) from cs277.{0} where score=1""".format(tableName))
        score1Cnt=cur.fetchone()[0]
        perScoreLimit=min(score1Cnt,MAX_REVIEWS_PER_SCORE)
        print(perScoreLimit)
        if perScoreLimit==0:
            # "limit 0" returns no rows; saving that would produce a 1-D empty
            # array that cannot be indexed by column later on.
            print('no score=1 rows in {0}, skipping'.format(tableName))
            continue
        # Build the five per-score sub-selects instead of spelling each one out.
        perScoreSelects='\nunion all\n'.join(
            SCORE_SELECT_TEMPLATE.format(tableName,score,perScoreLimit)
            for score in range(1,6))
        # NOTE: "order by rand()" is unseeded, so the row order differs between runs.
        cur.execute("""select score, summary, text from (
{0}
) as tmp
order by rand()
""".format(perScoreSelects))
        # Explicit numpy calls: the bare names array/save only exist when the
        # kernel was started in %pylab mode.
        rawDataArr=np.array(cur.fetchall())
        np.save(os.path.join(RAW_DATA_DIR,'rawData_{0}'.format(tableName)),rawDataArr)
        # Release the table's rows before fetching the next table.
        del rawDataArr


connection established
fetching table Gourmet_Foods ......
13375
fetching table Jewelry ......
4313
fetching table Home_Kitchen ......
20000
fetching table Shoes ......
20000
fetching table Sports_Outdoors ......
20000
fetching table Arts ......
2782
fetching table Industrial_Scientific ......
8684
fetching table Automotive ......
19662
fetching table Health ......
20000
fetching table Cell_Phones_Accessories ......
14360
fetching table Watches ......
5343
fetching table Amazon_Instant_Video ......
20000
fetching table Electronics ......
20000
fetching table Office_Products ......
16658
fetching table Pet_Supplies ......
20000
fetching table Patio ......
20000
fetching table Clothing_Accessories ......
20000
fetching table Software ......
20000
fetching table Tools_Home_Improvement ......
20000
fetching table Toys_Games ......
20000
fetching table Beauty ......
20000
fetching table Kindle_Store ......
12878
fetching table Musical_Instruments ......
7010
fetching table Video_Games ......
20000
fetching table Baby ......
1869
connection closed

In [2]:
# For every rawData_<category>.npy dump, fit a TF-IDF vectorizer on
# "<summary> <text>" and write three artefacts:
#   processedData/TfIdf_<category>       (via savemat, matrix stored under key 'data')
#   processedData/score_<category>       (via np.save, scores as uint8)
#   models/vectorizerTfIdf_<category>.pkl (pickled fitted vectorizer)
RAW_DATA_DIR='/home/hencrice/Downloads/AsterixDBClassData'
PROCESSED_DIR=os.path.join(RAW_DATA_DIR,'processedData')
MODEL_DIR=os.path.join(RAW_DATA_DIR,'models')
RAW_PREFIX='rawData_'
RAW_SUFFIX='.npy'

# savemat / np.save / open do not create missing parent directories.
for outDir in (PROCESSED_DIR,MODEL_DIR):
    if not os.path.isdir(outDir):
        os.makedirs(outDir)

# Require the prefix as well as the extension: the category name is obtained by
# stripping both, so an unrelated .npy file would otherwise yield a mangled name.
rawDataFileList=[fName for fName in next(os.walk(RAW_DATA_DIR))[2]
                 if fName.startswith(RAW_PREFIX) and fName.endswith(RAW_SUFFIX)]
for fName in rawDataFileList:
    print('Processing {0} ......'.format(fName))
    category=fName[len(RAW_PREFIX):-len(RAW_SUFFIX)]
    # Explicit numpy calls: the bare names load/uint8/save only exist when the
    # kernel was started in %pylab mode.
    dataArr=np.load(os.path.join(RAW_DATA_DIR,fName))
    # Columns are read positionally: 0 -> score, 1 -> summary, 2 -> text.
    scores=dataArr[:,0].astype(np.uint8)
    smryArr=dataArr[:,1]
    txtArr=dataArr[:,2]
    vectorizer=TfidfVectorizer(min_df=9, max_df=0.7, ngram_range=(1, 3))
    # Element-wise string concatenation: summary + " " + text for every row.
    tfidfMat=vectorizer.fit_transform(stringVecAdd(smryArr, stringVecAdd(" ",txtArr)))
    savemat(os.path.join(PROCESSED_DIR,'TfIdf_{0}'.format(category)), {'data': tfidfMat})
    np.save(os.path.join(PROCESSED_DIR,'score_{0}'.format(category)), scores)
    with open(os.path.join(MODEL_DIR,'vectorizerTfIdf_{0}.pkl'.format(category)),'wb') as fp:
        # Same protocol the original requested with -1, spelled out by name.
        cPickle.dump(vectorizer, fp, cPickle.HIGHEST_PROTOCOL)


Processing rawData_Home_Kitchen.npy ......
Processing rawData_Office_Products.npy ......
Processing rawData_Arts.npy ......
Processing rawData_Pet_Supplies.npy ......
Processing rawData_Toys_Games.npy ......
Processing rawData_Beauty.npy ......
Processing rawData_Gourmet_Foods.npy ......
Processing rawData_Software.npy ......
Processing rawData_Shoes.npy ......
Processing rawData_Kindle_Store.npy ......
Processing rawData_Tools_Home_Improvement.npy ......
Processing rawData_Industrial_Scientific.npy ......
Processing rawData_Electronics.npy ......
Processing rawData_Baby.npy ......
Processing rawData_Sports_Outdoors.npy ......
Processing rawData_Cell_Phones_Accessories.npy ......
Processing rawData_Jewelry.npy ......
Processing rawData_Patio.npy ......
Processing rawData_Watches.npy ......
Processing rawData_Video_Games.npy ......
Processing rawData_Clothing_Accessories.npy ......
Processing rawData_Health.npy ......
Processing rawData_Automotive.npy ......
Processing rawData_Musical_Instruments.npy ......