In [1]:
import json
import ujson
import datetime
from datetime import timedelta
from pprint import pprint

import json, datetime
def date_hook(json_dict):
    for (key, value) in json_dict.items():
        try:
            json_dict[key] = datetime.datetime.strptime(value,'%a %b %d %H:%M:%S +0000 %Y')
        except:
            pass
    return json_dict

def test_if_hour(tweet):
    if tweet['created_at'].minute == 0:
        return True


import timeit

def convert_date(tweet):
    for (key, value) in tweet.items():
        try:
            tweet[key] = datetime.datetime.strptime(value,'%a %b %d %H:%M:%S +0000 %Y')
        except:
            pass
    return tweet

#    tweet['created_at'] = datetime.datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
 #   return tweet

In [2]:
import pymongo
tweets = []

client = pymongo.MongoClient()
db = client["tweet_test"]
tweets = db.tweets


import sys, glob, errno

#initial array for the read-in tweets
path = '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/*.json'   
files = glob.glob(path)   
files.sort()
pprint (files)


['/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-001112.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-003035.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-005201.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-010349.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-011547.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-012800.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-014015.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-015445.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-021826.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-024014.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-030408.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-032640.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-034749.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-040918.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-043142.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-045615.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-051736.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-053752.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-055757.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-061712.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-063425.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-065134.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-070742.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-072333.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-073923.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-075452.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-080958.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-082426.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-083800.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-085041.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-090409.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-091713.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-092931.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-094230.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-095605.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-100759.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-101903.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-102959.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-104054.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-105358.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-110457.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-111317.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-112204.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-113214.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-114305.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-115444.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-120548.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-121216.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-121717.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-122219.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-122805.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-123419.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-124156.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-125108.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-130112.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-131218.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-132433.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-133554.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-134742.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-135933.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-141159.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-142358.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-143651.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-145012.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-150250.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-151354.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-152559.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-153711.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-154806.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-160034.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-161313.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-162416.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-163036.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-164139.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-165403.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-170657.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-171909.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-173216.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-174442.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-175734.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-181151.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-182559.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-183918.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-185218.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-190439.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-191716.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-193023.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-194233.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-195542.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-200856.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-202244.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-203421.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-204354.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-205441.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-210819.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-212252.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-213723.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-215445.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-221208.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-223059.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-225013.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-231021.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-233313.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/msftGoogAaplTweets.20141102-235802.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-001102.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-002907.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-004644.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-010300.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-010653.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-012322.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-012556.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-014312.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-014505.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-020158.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-022050.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-023904.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-025818.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-031531.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-033222.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-034958.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-040550.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-042218.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-043824.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-045522.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-051035.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-052551.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-054017.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-055432.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-060736.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-062052.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-063350.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-064629.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-065930.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-071032.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-072113.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-073328.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-074521.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-075728.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-080821.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-081953.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-083120.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-084310.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-085451.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-090522.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-091638.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-092811.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-093952.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-095132.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-100236.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-101357.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-102537.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-103738.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-104934.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-110051.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-111203.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-112401.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-113559.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-113825.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-114850.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-122938.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-225529.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-231239.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-233038.json',
 '/home/agonzales/git/mining-moods-markets/data/mongo/data/db/twtrFbAmznTweets.20141102-234838.json']

In [32]:
import timeit

def convert_date(tweet):
    for (key, value) in tweet.items():
        try:
            tweet[key] = datetime.datetime.strptime(value,'%a %b %d %H:%M:%S +0000 %Y')
        except:
            pass
    return tweet

#    tweet['created_at'] = datetime.datetime.strptime(tweet['created_at'],'%a %b %d %H:%M:%S +0000 %Y')
 #   return tweet


for name in files:
    try:
        for line in open(name): # No need to specify 'r': this is the default.
            #datetweets.append(ujson.loads(line, object_hook=date_hook))
            current_tweet = ujson.loads(line)
            current_tweet = convert_date(current_tweet)
            current_tweet['created_at'] = current_tweet['created_at'] + timedelta(hours = -5)
            #datetweets.append(ujson.loads(line))
            tweets.insert(current_tweet)
    except IOError as exc:
        if exc.errno != errno.EISDIR: # Do not fail if a directory is found, just ignore it.
            raise # Propag

In [3]:
print(tweets.count())
#pprint(tweets.find_one())
start_time = datetime.datetime(2014,11,2,8)
end_time = start_time + timedelta(hours = 1)
print(start_time, end_time)
pprint(tweets.find_one()['created_at'])


3228446
(datetime.datetime(2014, 11, 2, 8, 0), datetime.datetime(2014, 11, 2, 9, 0))
datetime.datetime(2014, 11, 2, 1, 11, 16)

In [4]:
bin = []
bin = [ tweet for tweet in 
           tweets.find({
                "created_at": {
                    "$lt": end_time,
                    "$gte": start_time}
               }) 
      ]

In [5]:
print(len(bin))
print(bin[0]['created_at'])
print(bin[len(bin)-1]['created_at'])
print(bin[len(bin)-1]['created_at'] - bin[0]['created_at'])
#pprint(bin[0])


161010
2014-11-02 08:00:00
2014-11-02 08:59:59
0:59:59

In [47]:
stopwords = ['for', 'if','was','a', 'and', '&amp', 'the', 'of', 'to', 'in', 'i','-','rt']
text = [[ word for word in tweet['text'].lower().split() if word not in stopwords]
                 for tweet in bin]


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-47-595c19494def> in <module>()
      2 text = [[ word for word in tweet['text'].lower().split() if word not in stopwords]
      3                  for tweet in bin]
----> 4 text = [re.sub("#", '', word) for word in text]

/usr/lib/python2.7/re.pyc in sub(pattern, repl, string, count, flags)
    149     a callable, it's passed the match object and must return
    150     a replacement string to be used."""
--> 151     return _compile(pattern, flags).sub(repl, string, count)
    152 
    153 def subn(pattern, repl, string, count=0, flags=0):

TypeError: expected string or buffer

In [15]:
from gensim import corpora
from gensim import models

tempDict = corpora.Dictionary(text)
tempDict.filter_extremes(no_below = 5, )
tempDict.filter_tokens()
tempDict.compactify()
print(tempDict)

tmp_corpus = [tempDict.doc2bow(item) for item in text]
pprint(tmp_corpus[0])


Dictionary(29162 unique tokens: [u'unfollow\u26d4\ufe0f\U0001f6ab', u'woods', u'https://t.co/licyzh4fth', u'#musicnews', u'woody']...)
[(1632, 1),
 (3356, 1),
 (4232, 1),
 (7938, 1),
 (13047, 1),
 (13561, 1),
 (19829, 1),
 (20792, 1),
 (24468, 1),
 (27986, 1)]

In [16]:
lsi = models.LsiModel(tmp_corpus, id2word=tempDict, num_topics=20)

In [17]:
lsi.show_topics()


Out[17]:
[u'0.515*"new" + 0.497*"facebook" + 0.429*"posted" + 0.402*"photo" + 0.167*"on" + 0.144*"you" + 0.103*"is" + 0.088*"video" + 0.080*"@youtube" + 0.073*"my"',
 u'0.402*"you" + 0.304*"@youtube" + 0.304*"on" + 0.262*"is" + 0.247*"video" + 0.209*"from" + -0.204*"posted" + -0.199*"photo" + -0.178*"facebook" + 0.176*"liked"',
 u'-0.569*"@youtube" + -0.434*"video" + -0.351*"liked" + 0.295*"you" + -0.228*"from" + 0.217*"on" + 0.197*"is" + -0.150*"via" + 0.114*"this" + 0.089*"my"',
 u'-0.686*"you" + 0.552*"on" + 0.323*"is" + 0.130*"my" + -0.098*"can" + 0.091*"this" + -0.073*"are" + -0.057*"have" + -0.056*"song" + -0.054*"night"',
 u'-0.643*"is" + 0.637*"on" + -0.192*"this" + -0.184*"apple" + 0.152*"you" + -0.091*"with" + -0.072*"that" + -0.062*"my" + -0.061*"new" + -0.057*"it"',
 u'-0.439*"from" + -0.274*"buy" + -0.274*"song" + 0.267*"via" + -0.253*"last" + 0.247*"@youtube" + -0.214*"please" + 0.211*"you" + -0.199*"us" + -0.191*"help"',
 u'0.640*"apple" + 0.320*"by" + -0.250*"this" + -0.200*"is" + 0.168*"full" + 0.165*"read" + 0.152*"price" + 0.147*"ebay:" + 0.144*"with" + -0.134*"my"',
 u'0.792*"my" + -0.296*"is" + -0.191*"on" + 0.167*"with" + 0.160*"out" + -0.146*"you" + 0.132*"me" + 0.128*"check" + 0.107*"at" + 0.093*"new"',
 u'0.469*"via" + 0.450*"this" + -0.288*"video" + -0.224*"is" + -0.219*"my" + -0.216*"liked" + 0.197*"@youtube" + 0.177*"with" + -0.172*"apple" + -0.161*"you"',
 u'-0.441*"via" + 0.333*"this" + 0.318*"with" + -0.276*"my" + 0.270*"video" + -0.257*"@youtube" + -0.182*"is" + 0.171*"liked" + 0.153*"at" + -0.129*"here"',
 u'-0.571*"with" + 0.541*"this" + 0.232*"apple" + -0.131*"is" + -0.120*"google" + -0.109*"at" + 0.099*"my" + 0.096*"video" + 0.095*"check" + 0.091*"out"',
 u'-0.470*"by" + 0.343*"apple" + -0.245*"new" + 0.183*"facebook" + 0.170*"with" + 0.170*"that" + -0.166*"full" + -0.165*"read" + -0.158*"out" + -0.151*"price"',
 u'-0.403*"new" + 0.275*"facebook" + 0.217*"this" + 0.216*"by" + 0.211*"with" + -0.186*"out" + 0.185*"my" + -0.184*"at" + -0.184*"it" + -0.178*"that"',
 u'-0.383*"that" + -0.258*"facebook" + 0.245*"new" + -0.227*"has" + 0.209*"with" + -0.177*"\u2026" + -0.168*"be" + 0.158*"apple" + -0.132*"by" + -0.131*"it"',
 u'-0.260*"will" + -0.251*"at" + -0.228*"be" + 0.224*"it" + 0.210*"have" + 0.204*"has" + -0.163*"\u2026" + 0.148*"their" + 0.146*"out" + 0.136*"#facebook"',
 u'0.531*"\u2026" + 0.259*"have" + -0.171*"google" + -0.163*"at" + -0.142*"out" + 0.129*"who" + 0.127*"been" + -0.126*"&amp;" + 0.126*"their" + 0.119*"wish"',
 u'0.348*"\u2026" + 0.269*"from" + 0.193*"&amp;" + -0.172*"video" + 0.167*"at" + -0.163*"that" + -0.149*"microsoft" + 0.145*"we" + -0.127*"google" + -0.127*"hevc"',
 u'-0.365*"#youtube" + -0.346*"#news" + -0.338*"#pics" + -0.338*"#pussy" + -0.268*"\u2026" + -0.194*"free" + 0.179*"google" + -0.164*"xnxx:" + 0.153*"at" + -0.130*"|"',
 u'0.245*"has" + -0.208*"the\u2026" + 0.196*"#facebook" + -0.191*"that" + -0.182*"xbox" + -0.178*"now" + -0.165*"says" + 0.165*"social" + -0.161*"microsoft\'s" + -0.160*"#pcgaming"',
 u'0.246*"10" + 0.238*"#mac" + 0.238*"desktop" + 0.237*"multiple" + 0.237*"runs" + 0.237*"operating" + 0.237*"#moreos" + 0.237*"parallels" + 0.236*"systems" + 0.236*"simultaneously"']

In [73]:
lda = models.LdaMulticore(tmp_corpus, id2word=tempDict, num_topics=10)

In [18]:
lsi.show_topics(formatted=False)


Out[18]:
[[(0.51510245284675382, u'new'),
  (0.4974225066455043, u'facebook'),
  (0.428836990288056, u'posted'),
  (0.40210798122326258, u'photo'),
  (0.16650663696683435, u'on'),
  (0.14442410172693115, u'you'),
  (0.10277331231071912, u'is'),
  (0.088192881124256745, u'video'),
  (0.079871511379061191, u'@youtube'),
  (0.072504120930325044, u'my')],
 [(0.40162474815883481, u'you'),
  (0.30424741253789778, u'@youtube'),
  (0.30357237372212509, u'on'),
  (0.26246589953916466, u'is'),
  (0.24733667143774807, u'video'),
  (0.20890763192466125, u'from'),
  (-0.20375966551044364, u'posted'),
  (-0.1991663486678526, u'photo'),
  (-0.17836565116640246, u'facebook'),
  (0.17587565148871553, u'liked')],
 [(-0.56881630595580346, u'@youtube'),
  (-0.43427252678917927, u'video'),
  (-0.35128233864062169, u'liked'),
  (0.29514658768365765, u'you'),
  (-0.2276969347364548, u'from'),
  (0.21744888868829956, u'on'),
  (0.19716857137395594, u'is'),
  (-0.14950129522634334, u'via'),
  (0.11418568194760759, u'this'),
  (0.08873113053010212, u'my')],
 [(-0.68608277024192654, u'you'),
  (0.55181769229746314, u'on'),
  (0.32285350526588547, u'is'),
  (0.13031424675288722, u'my'),
  (-0.098458476180441748, u'can'),
  (0.090551048770889292, u'this'),
  (-0.072894271179691772, u'are'),
  (-0.056635341174143136, u'have'),
  (-0.056217358936952927, u'song'),
  (-0.05445544065507936, u'night')],
 [(-0.64284563248600379, u'is'),
  (0.63651637535905881, u'on'),
  (-0.19221611551626908, u'this'),
  (-0.18437743674064139, u'apple'),
  (0.15235285182882205, u'you'),
  (-0.091456956939931319, u'with'),
  (-0.072051563724572817, u'that'),
  (-0.062067622766949083, u'my'),
  (-0.061169206576917226, u'new'),
  (-0.057012952150248687, u'it')],
 [(-0.43914278799596335, u'from'),
  (-0.27401891011624196, u'buy'),
  (-0.27390687440948286, u'song'),
  (0.26708462420423335, u'via'),
  (-0.25261107619043099, u'last'),
  (0.24651120450102529, u'@youtube'),
  (-0.21436102418793637, u'please'),
  (0.21082252950604349, u'you'),
  (-0.19857978003169632, u'us'),
  (-0.1905935493106185, u'help')],
 [(0.64023232279796516, u'apple'),
  (0.31971156976927001, u'by'),
  (-0.24951974831352608, u'this'),
  (-0.19982352896428218, u'is'),
  (0.16798411735496321, u'full'),
  (0.16459241416853734, u'read'),
  (0.15160875648387004, u'price'),
  (0.14749953062114271, u'ebay:'),
  (0.1439969460009303, u'with'),
  (-0.13414238494215086, u'my')],
 [(0.79242957144277892, u'my'),
  (-0.2955108604489094, u'is'),
  (-0.19142830468655486, u'on'),
  (0.16741413753584777, u'with'),
  (0.16016943855062002, u'out'),
  (-0.14551540606222457, u'you'),
  (0.13198538504396642, u'me'),
  (0.12802719403633542, u'check'),
  (0.10654355663742564, u'at'),
  (0.09295602613265587, u'new')],
 [(0.46854823859763439, u'via'),
  (0.44979940279208308, u'this'),
  (-0.28758082474187774, u'video'),
  (-0.22373757582545151, u'is'),
  (-0.21926527145461294, u'my'),
  (-0.21552583955781074, u'liked'),
  (0.19747631473851429, u'@youtube'),
  (0.17714234642418294, u'with'),
  (-0.17175717862334711, u'apple'),
  (-0.16068334817819838, u'you')],
 [(-0.44064487603586239, u'via'),
  (0.33251464987335272, u'this'),
  (0.31754099973461214, u'with'),
  (-0.27579696072279936, u'my'),
  (0.26992311074762221, u'video'),
  (-0.25702347035514761, u'@youtube'),
  (-0.18247579730085642, u'is'),
  (0.17073234273523222, u'liked'),
  (0.15309625421502987, u'at'),
  (-0.12885859139979272, u'here')],
 [(-0.57074982999554025, u'with'),
  (0.54128847376216049, u'this'),
  (0.23219168066589402, u'apple'),
  (-0.13068163849223616, u'is'),
  (-0.12031954799558588, u'google'),
  (-0.10900021644291544, u'at'),
  (0.098964727134973313, u'my'),
  (0.0958391738180637, u'video'),
  (0.094610975756488025, u'check'),
  (0.091353219375474859, u'out')],
 [(-0.46990909067378051, u'by'),
  (0.34335755365323173, u'apple'),
  (-0.24461671390191095, u'new'),
  (0.18330207127755982, u'facebook'),
  (0.17043115959646879, u'with'),
  (0.17008771810198273, u'that'),
  (-0.16558168086001065, u'full'),
  (-0.16495517576519036, u'read'),
  (-0.15840136956673051, u'out'),
  (-0.15102634394824624, u'price')],
 [(-0.40347467608503096, u'new'),
  (0.27516274901293142, u'facebook'),
  (0.21666846022679043, u'this'),
  (0.21587872097849431, u'by'),
  (0.21105765482657265, u'with'),
  (-0.18606157875111037, u'out'),
  (0.18530476299232285, u'my'),
  (-0.18403554580784875, u'at'),
  (-0.18397254670722701, u'it'),
  (-0.17848486531073623, u'that')],
 [(-0.38261565513472701, u'that'),
  (-0.2581987685560912, u'facebook'),
  (0.24540093061803242, u'new'),
  (-0.22679897109313263, u'has'),
  (0.20895490875184794, u'with'),
  (-0.17726741853498135, u'\u2026'),
  (-0.16809125879355624, u'be'),
  (0.15815691436844553, u'apple'),
  (-0.13234681764178952, u'by'),
  (-0.13055698760930401, u'it')],
 [(-0.26022036203544779, u'will'),
  (-0.25146553448205777, u'at'),
  (-0.22821581313690861, u'be'),
  (0.22385403882975832, u'it'),
  (0.20996457577060079, u'have'),
  (0.20421092707370012, u'has'),
  (-0.16349683881638682, u'\u2026'),
  (0.14833567535881459, u'their'),
  (0.14649164102847867, u'out'),
  (0.13604138677165409, u'#facebook')],
 [(0.53090197476492929, u'\u2026'),
  (0.25870478404648467, u'have'),
  (-0.17141309983406317, u'google'),
  (-0.1629479463716712, u'at'),
  (-0.14193798282603656, u'out'),
  (0.12859662364884736, u'who'),
  (0.1265458715645929, u'been'),
  (-0.12564787524143933, u'&amp;'),
  (0.12550076045744407, u'their'),
  (0.11917440007664482, u'wish')],
 [(0.34789994761811427, u'\u2026'),
  (0.26937805083682598, u'from'),
  (0.19305444376215264, u'&amp;'),
  (-0.17194575232335671, u'video'),
  (0.16666875683714619, u'at'),
  (-0.16314888570991329, u'that'),
  (-0.14947322915916955, u'microsoft'),
  (0.14523515137908161, u'we'),
  (-0.12731621308705401, u'google'),
  (-0.12702352377477169, u'hevc')],
 [(-0.36460077494325149, u'#youtube'),
  (-0.34611760566931521, u'#news'),
  (-0.3376176863174965, u'#pics'),
  (-0.33761679011343737, u'#pussy'),
  (-0.26822495171511795, u'\u2026'),
  (-0.19445545843090326, u'free'),
  (0.1789357171027777, u'google'),
  (-0.1636566099448232, u'xnxx:'),
  (0.15268913553454222, u'at'),
  (-0.12998520509725772, u'|')],
 [(0.24475264371412356, u'has'),
  (-0.2079642512624493, u'the\u2026'),
  (0.19624616871698741, u'#facebook'),
  (-0.1908770482127326, u'that'),
  (-0.18247606635842653, u'xbox'),
  (-0.17797508381839877, u'now'),
  (-0.16469719055092441, u'says'),
  (0.1646090631311167, u'social'),
  (-0.16073215977754249, u"microsoft's"),
  (-0.1601979162006108, u'#pcgaming')],
 [(0.24638987087510036, u'10'),
  (0.23832533780954879, u'#mac'),
  (0.2378925206349827, u'desktop'),
  (0.23699607258243971, u'multiple'),
  (0.23695237974566163, u'runs'),
  (0.23687925492774942, u'operating'),
  (0.2365964619339295, u'#moreos'),
  (0.23657768403163285, u'parallels'),
  (0.23568088095908987, u'systems'),
  (0.23555026008963115, u'simultaneously')]]

In [40]:
import math
import re
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
 
# AFINN-111 is as of June 2011 the most recent version of AFINN
filenameAFINN = '/home/agonzales/git/mining-moods-markets/src/AFINN/AFINN-111.txt'
afinn = dict(map(lambda (w, s): (w, int(s)), [ 
            ws.strip().split('\t') for ws in open(filenameAFINN) ]))
 
# Word splitter pattern
pattern_split = re.compile(r"\W+")
 
def sentiment(text):
    """
    Returns a float for sentiment strength based on the input text.
    Positive values are positive valence, negative value are negative valence. 
    """
    #words = pattern_split.split(text)
    words = text
    #print(words)
    sentiments = map(lambda word: afinn.get(word, 0), words)
    if sentiments:
        # How should you weight the individual word sentiments? 
        # You could do N, sqrt(N) or 1 for example. Here I use sqrt(N)
        sentiment = float(sum(sentiments)/math.sqrt(len((sentiments))))
        
    else:
        sentiment = 0
    return sentiment

In [45]:
topics = lsi.show_topics(formatted=False)

#pprint(topics)
words = []
for topic in topics:
    for item in topic:
        words.append(item[1])
print(words)
for word in words:
    re.sub("#", '', word)
print(words)

In [46]:
sentiment(words)


Out[46]:
0.9192388155425117