In [1]:
import pandas as pd
import numpy as np
import nltk
import time
import csv
from bs4 import BeautifulSoup
from scipy.sparse import lil_matrix
from scipy.io import mmwrite, mmread
from ast import literal_eval
import itertools
from itertools import izip
from taggerfunctions import *
from nltk.stem.wordnet import WordNetLemmatizer
In [2]:
uselesssymbols = ['. ','\n',"'",'\"','(',')',',',';',':','?','!','&','$']  # punctuation that carries no signal
def tokenizeWords(entry):
    # strip code blocks, links and images before extracting the plain text
    soup = BeautifulSoup(entry)
    for tag in soup.find_all(["pre", "code", "a", "img"]):
        tag.decompose()
    entry = soup.get_text().encode('ascii', 'ignore')
    for symbol in uselesssymbols:
        entry = entry.replace(symbol, ' ')
    entrytok = nltk.word_tokenize(entry)
    entrytok = [w.lower() for w in entrytok]
    return tag_pos(entrytok)
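For illustration, this is what the BeautifulSoup stripping does to a post body (the sample HTML is made up):
In [ ]:
sample = '<p>How do I parse <code>HTML</code> in Python?</p><pre>import bs4</pre>'
soup = BeautifulSoup(sample)
for tag in soup.find_all(["pre", "code", "a", "img"]):
    tag.decompose()
print(soup.get_text())  # -> 'How do I parse  in Python?'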
In [3]:
def tag_pos(entrytok):
    entryselect = []
    entrytoktag = braubt_tagger.tag(entrytok)
    # keep only content-bearing POS tags and lemmatize the surviving tokens
    for tok, tag in entrytoktag:
        if tag not in ('VBP', 'CC', 'CD', 'RB', 'TO', 'VB', 'DT', 'IN', 'PRP', 'VBZ', 'WDT', '-NONE-'):
            try:
                tok_lemmatized = lemmatizer.lemmatize(tok, get_wordnet_pos(tag))
            except Exception:
                # tag has no WordNet equivalent: fall back to the default (noun) lemma
                tok_lemmatized = lemmatizer.lemmatize(tok)
            entryselect.append(tok_lemmatized)
    return entryselect
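get_wordnet_pos is imported from taggerfunctions, which is not shown in this notebook. A typical implementation (an assumption, not the module's actual code) maps Penn Treebank tags onto the WordNet POS constants that WordNetLemmatizer.lemmatize accepts:
In [ ]:
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    # illustration only: the real function comes from taggerfunctions
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        raise KeyError(treebank_tag)  # caught in tag_pos, which then uses the noun lemma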
In [4]:
braubt_tagger = braubt_Tagger()
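braubt_Tagger also comes from taggerfunctions. The name suggests a Brill tagger built on an affix/unigram/bigram/trigram backoff chain; a minimal sketch of such a chain, trained on the NLTK treebank sample (an assumption about what the module builds, not its actual code):
In [ ]:
import nltk
from nltk.corpus import treebank

def backoff_tagger(train_sents, tagger_classes, backoff=None):
    # chain the taggers so each one falls back to the previous
    for cls in tagger_classes:
        backoff = cls(train_sents, backoff=backoff)
    return backoff

train_sents = treebank.tagged_sents()[:3000]
demo_tagger = backoff_tagger(
    train_sents,
    [nltk.AffixTagger, nltk.UnigramTagger, nltk.BigramTagger, nltk.TrigramTagger],
    backoff=nltk.DefaultTagger('NN'))
print(demo_tagger.tag(['parse', 'html', 'with', 'python']))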
In [5]:
def getDict(fname):
    # load a two-column CSV of key,repr(value) pairs into a Series
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        dictWords = {rows[0]: literal_eval(rows[1]) for rows in reader}
    return pd.Series(dictWords)

def getSeries(fname, fromRow, toRow):
    # load only the rows in [fromRow, toRow) of a CSV whose keys are literals
    dictWords = {}
    rowNum = 0
    with open(fname, 'r') as f:
        reader = csv.reader(f)
        for row in reader:
            if fromRow <= rowNum < toRow:
                dictWords[literal_eval(row[0])] = literal_eval(row[1])
            if rowNum == toRow:
                return pd.Series(dictWords)
            rowNum += 1
    return pd.Series(dictWords)
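A quick usage sketch for the two helpers (dictKeys.csv is written further down in this notebook, and the bWords_* files come from the body-tokenizing runs):
In [ ]:
dictKeys = getDict('dictKeys.csv')                   # tag -> row index
firstFive = getSeries('bWords_0-638582.csv', 0, 5)   # token lists of the first 5 bodies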
In [ ]:
reader = pd.read_csv("train.csv", verbose=True, chunksize=100000)
kwlist = []
for chunk in reader:
    for entry in chunk['Tags']:
        kwlist.extend(entry.split())
kwDist = nltk.FreqDist(kwlist)
In [14]:
kwDist_series = pd.Series(kwDist)
In [16]:
kwDist_series.to_csv("kwDist.csv")
In [17]:
kwDist = {}
with open('kwDist.csv', 'r') as f:
    reader = csv.reader(f)
    kwDist = {rows[0]: literal_eval(rows[1]) for rows in reader}
kwDist = pd.Series(kwDist)
In [21]:
kwDist.order(ascending=False)[:50].plot()  # plot the 50 most frequent tags
In [ ]:
dictKeys = dict(zip(kwDist.keys(), range(0, len(kwDist))))  # map each tag to a row index
In [195]:
dictKeys = pd.Series(dictKeys)
In [108]:
dictKeys.to_csv("dictKeys.csv")
In [5]:
dictKeys = {}
with open('dictKeys.csv', 'r') as f:
    reader = csv.reader(f)
    dictKeys = {rows[0]: literal_eval(rows[1]) for rows in reader}
dictKeys = pd.Series(dictKeys)
In [6]:
invdictKeys = {}
for key, value in dictKeys.iteritems():
    invdictKeys[value] = key
invdictKeys = pd.Series(invdictKeys)
In [7]:
invdictKeys.to_csv("invdictKeys.csv")
In [6]:
distinctWords = dict()
with open("tWordsnew.csv", "r") as f:
    reader = csv.reader(f)
    for rows in reader:
        # each row stores repr(token list) for one title
        entrylst = literal_eval(rows[1])
        for word in entrylst:
            if word not in distinctWords:
                distinctWords[word] = 1
            else:
                distinctWords[word] += 1
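The same frequency count can be written more compactly with collections.Counter (a stylistic alternative, not what the notebook runs):
In [ ]:
from collections import Counter

distinctWordsCounter = Counter()
with open("tWordsnew.csv", "r") as f:
    for rows in csv.reader(f):
        distinctWordsCounter.update(literal_eval(rows[1]))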
In [8]:
distinctWords2 = {key: value for key, value in distinctWords.iteritems() if value > 10}  # keep words seen more than 10 times
In [10]:
dWords = pd.Series(distinctWords2)
In [12]:
dictWords = dict(zip(dWords.keys(), range(0, len(dWords))))  # map each title word to a column index
In [13]:
dictWords = pd.Series(dictWords)
In [14]:
dictWords.to_csv("dictWordsTitleNew.csv")
In [9]:
dictWords = {}
with open('dictWords.csv', 'r') as f:
    reader = csv.reader(f)
    dictWords = {rows[0]: literal_eval(rows[1]) for rows in reader}
dictWords = pd.Series(dictWords)
In [15]:
invdictWords = dict(zip(range(0,len(dWords)), dWords.keys()))
invdictWords = pd.Series(invdictWords)
In [10]:
invdictWords = {}
for key, value in dictWords.iteritems():
    invdictWords[value] = key
invdictWords = pd.Series(invdictWords)
In [16]:
invdictWords.to_csv("invdictWordsNew.csv")
In [70]:
distinctWordsBody = dict()
with open("bWords_0-638582.csv", "r") as f:
    reader = csv.reader(f)
    for rows in reader:
        entrylst = literal_eval(rows[1])
        for word in entrylst:
            if word not in distinctWordsBody:
                distinctWordsBody[word] = 1
            else:
                distinctWordsBody[word] += 1
In [71]:
distinctWordsBody2 = {key: value for key, value in distinctWordsBody.iteritems() if value > 10}
print len(distinctWordsBody2)
dWordsBody = pd.Series(distinctWordsBody2)
dictWordsBody = dict(zip(dWordsBody.keys(), range(0, len(dWordsBody))))
dictWordsBody = pd.Series(dictWordsBody)
dictWordsBody.to_csv("dictWordsBody_0-638582.csv")
invdictWordsBody = dict(zip(range(0, len(dWordsBody)), dWordsBody.keys()))
invdictWordsBody = pd.Series(invdictWordsBody)
invdictWordsBody.to_csv("invdictWordsBody_0-638582.csv")
In [8]:
fRange = ['0-638582', '600000-1200000', '1200000-1800000', '1800000-2400000', '2400000-3000000', '3000000-3600000',
          '3600000-4200000', '4200000-4800000', '4800000-5400000', '5400000-6000000', '6000000-6034195']
dictWordsBody = dict()
for fileRange in fRange:
    fname = 'dictWordsBody_' + fileRange + '.csv'
    print fname
    with open(fname, "r") as f:
        reader = csv.reader(f)
        for rows in reader:
            # merge the per-chunk dictionaries, keeping the first index seen per word
            if rows[0] not in dictWordsBody:
                dictWordsBody[rows[0]] = literal_eval(rows[1])
In [10]:
dictWordsBody = pd.Series(dictWordsBody)
In [11]:
dictWordsBody.to_csv("dictWordsBodyFull.csv")
Careful: elements have to be assigned into a lil_matrix, but saving it to a file automatically converts it to a coo_matrix. Save and reload the matrix before you continue using it; otherwise the indices will change the next time.
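A small sketch of that round trip: mmwrite stores the matrix in Matrix Market (COO) layout, and mmread always hands back a coo_matrix, whatever format was saved.
In [ ]:
from scipy.sparse import lil_matrix
from scipy.io import mmwrite, mmread

demo = lil_matrix((3, 3))
demo[0, 1] = 2
mmwrite('demo.mtx', demo)      # stored in COO layout on disk
reloaded = mmread('demo.mtx')  # always comes back as a coo_matrix
print(type(reloaded))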
In [80]:
coocMat_lil = lil_matrix( (len(dictKeys), len(dictWords)) )  # tags x title-words co-occurrence matrix
In [ ]:
lemmatizer = WordNetLemmatizer()
reader = pd.read_csv("train.csv", chunksize=100000)
count = 1
timeStart = time.time()
for chunk in reader:
    for tags, entry in zip(chunk['Tags'], chunk['Title']):
        entryselect = tokenizeWords(entry)
        # keep only tokens that are part of the title vocabulary
        gen = (word for word in entryselect if word in dictWords)
        splitTags = tags.split()
        for word in gen:
            for tag in splitTags:
                coocMat_lil[dictKeys[tag], dictWords[word]] += 1
        if count % 100000 == 0:
            print 'entry', count, 'finished'
            timeEnd = time.time()
            print 'time for 100000 loops:', timeEnd - timeStart
            timeStart = time.time()
        count += 1
In [82]:
mmwrite("coocMatTitleNew_coo.mtx", coocMat_lil)
In [4]:
coocMat_coo = mmread("coocMat_coo.mtx")
In [5]:
coocMat_csr = coocMat_coo.tocsr()
coocMat_csc = coocMat_coo.tocsc()
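Keeping both formats is deliberate: CSR makes row slicing (all words co-occurring with one tag) cheap, while CSC makes column slicing (all tags co-occurring with one word) cheap. A standalone sketch:
In [ ]:
from scipy.sparse import rand

m = rand(1000, 1000, density=0.01, format='coo')
row0 = m.tocsr().getrow(0)  # efficient row access in CSR
col0 = m.tocsc().getcol(0)  # efficient column access in CSC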
In [92]:
coocMat_coo = None
In [7]:
from collections import Counter
In [ ]:
lemmatizer = WordNetLemmatizer()
count = 1
countChunk = 1
reader2 = pd.read_csv("bWords_5400000-6000000.csv", header=None, index_col=0, chunksize=100000)
reader = pd.read_csv("train.csv", chunksize=100000)
fnameDict = 'dictWordsBodyFull.csv'
dictWordsBody = getDict(fnameDict)
coocMatBody_lil = lil_matrix( (len(dictKeys), len(dictWordsBody)) )
timeStart = time.time()
dictforMat = Counter()
for chunk, chunk2 in izip(reader, reader2):
    for tags, entry in zip(chunk['Tags'], chunk2[1]):
        entrylst = literal_eval(entry)  # parse the stored token list once per entry
        set_entry = set(entrylst)
        splitTags = tags.split()
        iterWords = [w for w in set_entry if w in dictWordsBody]
        for item in itertools.product(splitTags, iterWords):
            # buffer counts in a Counter instead of updating the lil_matrix per pair
            dictforMat[dictKeys[item[0]], dictWordsBody[item[1]]] += entrylst.count(item[1])
        if count % 100000 == 0:
            print("entry {0:d} finished".format(count))
            print("time for 100000 loops: {0:.0f}s".format(time.time() - timeStart))
            timeStart = time.time()
        if count % 600000 == 0:
            # flush the buffered counts into the sparse matrix and write it out
            for key, value in dictforMat.iteritems():
                coocMatBody_lil[key] = value
            fname = 'coocMatBody_coo_5400000-6000000.mtx'
            mmwrite(fname, coocMatBody_lil)
        count += 1
    countChunk += 1
    if countChunk > 6:
        break
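The Counter buffer is the important trick in the cell above: incrementing a lil_matrix element by element is slow, so the counts are accumulated in a plain dict first and written into the matrix once at the end. In isolation:
In [ ]:
from collections import Counter
from scipy.sparse import lil_matrix

buf = Counter()
buf[(0, 2)] += 3   # (row, column) -> co-occurrence count
buf[(1, 1)] += 1
mat = lil_matrix((2, 3))
for (r, c), v in buf.iteritems():
    mat[r, c] = v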
For the last chunk, which is smaller than 600000 rows:
In [11]:
lemmatizer = WordNetLemmatizer()
count = 1
countChunk = 1
reader2 = pd.read_csv("bWords_6000000-6034195.csv", header=None, index_col=0, chunksize=100000)
reader = pd.read_csv("train.csv", chunksize=100000)
fnameDict = 'dictWordsBodyFull.csv'
dictWordsBody = getDict(fnameDict)
coocMatBody_lil = lil_matrix( (len(dictKeys), len(dictWordsBody)) )
timeStart = time.time()
dictforMat = Counter()
for chunk, chunk2 in izip(reader, reader2):
    for tags, entry in zip(chunk['Tags'], chunk2[1]):
        entrylst = literal_eval(entry)  # parse the stored token list once per entry
        set_entry = set(entrylst)
        splitTags = tags.split()
        iterWords = [w for w in set_entry if w in dictWordsBody]
        for item in itertools.product(splitTags, iterWords):
            dictforMat[dictKeys[item[0]], dictWordsBody[item[1]]] += entrylst.count(item[1])
        if count % 10000 == 0:
            print("entry {0:d} finished".format(count))
            print("time for 10000 loops: {0:.0f}s".format(time.time() - timeStart))
            timeStart = time.time()
        if count % 34195 == 0:
            for key, value in dictforMat.iteritems():
                coocMatBody_lil[key] = value
            fname = 'coocMatBody_coo_6000000-6034195.mtx'
            mmwrite(fname, coocMatBody_lil)
        count += 1
    countChunk += 1
    if countChunk > 1:
        break
In [ ]:
fRange = ['0-600000', '600000-1200000', '1200000-1800000', '1800000-2400000', '2400000-3000000', '3000000-3600000',
          '3600000-4200000', '4200000-4800000', '4800000-5400000', '5400000-6000000', '6000000-6034195']
dictWordsBody = getDict('dictWordsBodyFull.csv')
dictKeys = getDict('dictKeys.csv')
coocMatBodyFull_lil = lil_matrix( (len(dictKeys), len(dictWordsBody)) )
for r in fRange:
    fname = "coocMatBody_coo_" + r + ".mtx"
    print("adding matrix {0:s}".format(r))
    coocMatBodyPart_coo = mmread(fname)
    coocMatBodyPart_csr = coocMatBodyPart_coo.tocsr()
    # add the nonzero entries of each partial matrix into the full matrix
    rows, columns = coocMatBodyPart_csr.nonzero()
    for row, column in zip(rows, columns):
        coocMatBodyFull_lil[row, column] += coocMatBodyPart_csr[row, column]
fname = 'coocMatBodyFull_coo.mtx'
mmwrite(fname, coocMatBodyFull_lil)
In [6]:
coocMatBodyFull_coo = mmread("coocMatBodyFull_coo.mtx")
In [7]:
coocMatBodyFull_csr = coocMatBodyFull_coo.tocsr()
A different method: load the partial co-occurrence matrices, convert them to CSR format, and then sum them with the overloaded addition operator.
In [27]:
fname = "coocMatBody_coo_6000000-6034195.mtx"
coocMatBodyPart_coo = mmread(fname)
In [28]:
coocMatBodyPart_csr = coocMatBodyPart_coo.tocsr()
In [29]:
coocMatBodyFull_csr = coocMatBodyFull_csr + coocMatBodyPart_csr
In [30]:
mmwrite('coocMatBodyFull2_csr.mtx', coocMatBodyFull_csr)