In [1]:
import cPickle
import os

import numpy as np
from numpy.core.defchararray import add as stringVecAdd
from scipy.io import savemat
from scipy.sparse import hstack as sparse_hstack
from sklearn.feature_extraction.text import TfidfVectorizer

from SQLConnect import sqlSession
In [5]:
# Cell purpose: for every imported .txt dump, query the cs277 table of the same
# name for a score-balanced, randomly ordered sample of reviews and cache the
# rows as rawData_<table>.npy.
#
# numpy is referenced explicitly as `np` so the cell runs on a fresh kernel
# instead of relying on `array`/`save` having been injected by %pylab.

# Directory of imported dumps; each '<name>.txt' maps to the table cs277.<name>.
IMPORTED_TXT_DIR = '/home/hencrice/Downloads/AmazonProductDataset_MySQL/imported'
# Directory that receives the rawData_<table>.npy files.
RAW_DATA_OUT_DIR = '/home/hencrice/Downloads/AsterixDBClassData'
# Upper bound on the number of rows drawn for each score value.
MAX_ROWS_PER_SCORE = 20000

# {0} = table name.
COUNT_SCORE1_SQL = """select count(*) from cs277.{0} where score=1"""

# {0} = table name, {1} = row limit applied to each of the five score values.
# NOTE(review): the table name is interpolated into the statement because SQL
# identifiers cannot be bound as parameters; this is only acceptable while the
# names come from local file names.
# NOTE(review): `order by rand()` is unseeded, so the row order differs per run.
BALANCED_SAMPLE_SQL = """select score, summary, text from (
(select cast(score as unsigned) as 'score', summary, text from cs277.{0} where score=1 limit {1})
union all
(select cast(score as unsigned) as 'score', summary, text from cs277.{0} where score=2 limit {1})
union all
(select cast(score as unsigned) as 'score', summary, text from cs277.{0} where score=3 limit {1})
union all
(select cast(score as unsigned) as 'score', summary, text from cs277.{0} where score=4 limit {1})
union all
(select cast(score as unsigned) as 'score', summary, text from cs277.{0} where score=5 limit {1})
) as tmp
order by rand()
"""

# np.save does not create missing parent directories.
if not os.path.isdir(RAW_DATA_OUT_DIR):
    os.makedirs(RAW_DATA_OUT_DIR)

# Only the files directly inside IMPORTED_TXT_DIR are considered (no recursion).
txtFileList = [fName for fName in next(os.walk(IMPORTED_TXT_DIR))[2] if fName.endswith('.txt')]

# NOTE(review): the meaning of the positional True is defined by SQLConnect.sqlSession,
# which is not visible here -- confirm before changing it.
with sqlSession(True) as cnxn:
    cur = cnxn.cursor()
    for fname in txtFileList:
        tableName = fname[:-4]  # drop the '.txt' extension
        print('fetching table {0} ......'.format(tableName))

        # The number of score=1 rows (capped) is used as the per-score limit for
        # all five scores -- presumably because score=1 is the scarcest class.
        cur.execute(COUNT_SCORE1_SQL.format(tableName))
        score1Cnt = min(cur.fetchone()[0], MAX_ROWS_PER_SCORE)
        print(score1Cnt)

        if score1Cnt == 0:
            # `limit 0` would return no rows for any score; saving that empty
            # array would only make the downstream column slicing fail.
            print('no score=1 rows in {0}; skipping'.format(tableName))
            continue

        cur.execute(BALANCED_SAMPLE_SQL.format(tableName, score1Cnt))
        rawDataArr = np.array(cur.fetchall())
        # np.save appends the '.npy' extension itself.
        np.save(os.path.join(RAW_DATA_OUT_DIR, 'rawData_{0}'.format(tableName)), rawDataArr)
        # Release the sample before the next table is fetched.
        del rawDataArr
In [2]:
# Cell purpose: turn every cached rawData_<table>.npy sample into
#   processedData/TfIdf_<table>.mat           (sparse TF-IDF matrix under key 'data'),
#   processedData/score_<table>.npy           (uint8 score vector),
#   models/vectorizerTfIdf_<table>.pkl        (the fitted vectorizer).
#
# numpy is referenced explicitly as `np` so the cell runs on a fresh kernel
# instead of relying on `load`/`save`/`uint8` having been injected by %pylab.

CLASS_DATA_DIR = '/home/hencrice/Downloads/AsterixDBClassData'
PROCESSED_DIR = os.path.join(CLASS_DATA_DIR, 'processedData')
MODELS_DIR = os.path.join(CLASS_DATA_DIR, 'models')
RAW_PREFIX = 'rawData_'
RAW_SUFFIX = '.npy'

# savemat / np.save / open do not create missing parent directories.
for outDir in (PROCESSED_DIR, MODELS_DIR):
    if not os.path.isdir(outDir):
        os.makedirs(outDir)

# Only files directly inside CLASS_DATA_DIR are listed (no recursion). The prefix
# check keeps unrelated .npy files out, since the table name is recovered below
# by stripping exactly this prefix and suffix.
rawDataFileList = [
    fName for fName in next(os.walk(CLASS_DATA_DIR))[2]
    if fName.startswith(RAW_PREFIX) and fName.endswith(RAW_SUFFIX)
]

for fName in rawDataFileList:
    print('Processing {0} ......'.format(fName))
    dataArr = np.load(os.path.join(CLASS_DATA_DIR, fName))

    if dataArr.ndim != 2 or dataArr.shape[0] == 0:
        # An empty sample is not a 2-D table and cannot be sliced by column.
        print('{0} holds no rows; skipping'.format(fName))
        continue

    tableName = fName[len(RAW_PREFIX):-len(RAW_SUFFIX)]

    # Columns 0, 1, 2 are read as score, summary and text respectively.
    scores = dataArr[:, 0].astype(np.uint8)
    smryArr = dataArr[:, 1]
    txtArr = dataArr[:, 2]

    # Each document is "<summary> <text>", concatenated element-wise.
    documents = stringVecAdd(smryArr, stringVecAdd(" ", txtArr))

    # min_df=9: drop terms seen in fewer than 9 documents; max_df=0.7: drop terms
    # seen in more than 70% of documents; ngram_range=(1, 3): 1- to 3-word n-grams.
    vectorizer = TfidfVectorizer(min_df=9, max_df=0.7, ngram_range=(1, 3))
    savemat(os.path.join(PROCESSED_DIR, 'TfIdf_{0}'.format(tableName)),
            {'data': vectorizer.fit_transform(documents)})
    np.save(os.path.join(PROCESSED_DIR, 'score_{0}'.format(tableName)), scores)

    # Protocol -1 selects the highest pickle protocol available.
    with open(os.path.join(MODELS_DIR, 'vectorizerTfIdf_{0}.pkl'.format(tableName)), 'wb') as fp:
        cPickle.dump(vectorizer, fp, -1)