In [1]:
import pandas as pd
pd.set_option('display.max_columns',500)
import csv,re,sys,os,glob,json,operator
import collections

In [2]:
class tweets(object):
    def __iter__(self):
        for f in glob.glob('../data/2014-*/*json')[0:10]:
            for line in open(f,'r').read().decode('utf-8').split('\n'):
                tweet=json.loads(line)
                yield tweet['interaction']['content'].replace('\n',' ').lower()

In [3]:
import nltk as nltk
stopWords = nltk.corpus.stopwords.words('english')

In [4]:
stopWords.extend(["rt", "don't", "via", "ht", "mt"])

In [5]:
tweetStream=tweets()
unigramCounter=collections.defaultdict(int)
bigramCounter=collections.defaultdict(int)

for t in tweetStream:
    toks= t.split(' ')
    for t in [t for t in toks if not t in stopWords]:
        unigramCounter[t]+=1

In [6]:
sortedUnigramCounter = sorted(unigramCounter.iteritems(), key=operator.itemgetter(1))

In [7]:
for s in sortedUnigramCounter[-20:]:
    print s[0],s[1]


#environment 616
water 667
new 681
ebola. 684
#climatechange 908
change. 913
warming 1094
#climate 1182
& 1186
environment 1193
everyone: 1194
freak 1199
scientists: 1204
carbon 1232
- 1493
drought 1707
global 1844
change 1949
 3631
climate 4422

In [12]:
class tweets(object):
    def __iter__(self):
        for f in glob.glob('../data/2014-*/*json')[0:10]:
            for line in open(f,'r').read().decode('utf-8').split('\n'):
                tweet=json.loads(line)
                yield tweet['interaction']['content'].replace('\n',' ').lower()

In [20]:
tweets=[]
for file in glob.glob('../data/2014-*/*json')[0:10]:
# Cycle through files
    fileString=open(file,'r').read().decode('utf-8')
    # Read file as one long string and convert to unicode
    fileTweets=[json.loads(line) for line in fileString.split('\n')]
    # Split into lines and load as JSON
    tweets.extend(fileTweets)
    # Add list of tweets from file to global list
print('We have %d tweets' % len(tweets))


We have 16811 tweets

In [32]:
nDocumentsError=0

documents=[]

for tweet in tweets:
  try:
    documents.append([tweet['interaction']['content']].encode('utf-8'))
  except:
    documents.append('NaN')
    nDocumentsError+=1
    
print nDocumentsError


16811

In [22]:
tokens = nltk.word_tokenize(documents)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-22-19755f9046b4> in <module>()
----> 1 tokens = nltk.word_tokenize(documents)

/home/ubuntu/anaconda/lib/python2.7/site-packages/nltk/tokenize/__init__.pyc in word_tokenize(text)
     85     This tokenizer is designed to work on a sentence at a time.
     86     """
---> 87     return _word_tokenize(text)
     88 
     89 

/home/ubuntu/anaconda/lib/python2.7/site-packages/nltk/tokenize/treebank.pyc in tokenize(self, text)
     65     def tokenize(self, text):
     66         #starting quotes
---> 67         text = re.sub(r'^\"', r'``', text)
     68         text = re.sub(r'(``)', r' \1 ', text)
     69         text = re.sub(r'([ (\[{<])"', r'\1 `` ', text)

/home/ubuntu/anaconda/lib/python2.7/re.pyc in sub(pattern, repl, string, count, flags)
    149     a callable, it's passed the match object and must return
    150     a replacement string to be used."""
--> 151     return _compile(pattern, flags).sub(repl, string, count)
    152 
    153 def subn(pattern, repl, string, count=0, flags=0):

TypeError: expected string or buffer

In [25]:
text = nltk.Text(tokens)
text


Out[25]:
<Text: business news - bankrolling energy exports creates rift between fossil fuels and green power: with boundaries ... http://t.co/esctb741ij emma thompson visits arctic to send tony abbott a climate change message - the guardian: the guardianemma thom... http://t.co/x1ucvxmj6z the climate implications of u.s. liquefied natural gas, or lng, exports http://t.co/dlqwiaii82 get a world of #climate &amp; energy news via email every morning! sign up for a free #climatenexus hot news subscription http://t.co/im15bnk2xy rt @newscientist: local people preserve the environment better than governments http://t.co/fejjnvuoua http://t.co/mocz1iykdr california wake-up call: extreme drought will lead 2 migration exit n real estate collapse http://t.co/sspmlshru7 #climate #chaos #yvr #lng want the best deal on 3 big enviro #potato# pod forever- the ultimate potato planter, what a deal! http://t.co/qhmrrnjdba "@9gag: sometimes i feel useless, but then i remember i breathe out carbon dioxide for plants"...>

In [27]:
text.collocations()


Building collocations list
rt @asapscience: a good example of a simple acid + base reaction, in
which carbon dioxide gas fills the balloon. #sixseconds...
https://t.co/yuonvczuan rt @barackobama: retweet if you support
cutting dangerous carbon pollution. http://t.co/h3pefwnpuj; rt
@barackobama: retweet if you support cutting dangerous carbon
pollution. http://t.co/h3pefwnpuj rt @bathmanreturns: newspaper having
extra page for celebrating world environment day. irony,rip.; diamonds
and coal are both made from the same element: carbon. rt @joelosteen:
the droughts in your life are coming to an end; you’re going to see
restoration, increase, healing &amp; new opportunities.; rt @emmkaff:
scientists: don't freak out about ebola. everyone: *panic!*
scientists: freak out about climate change. everyone: lol! pass me
some coal. rt @brendohare: hey scientist: if global warming is real,
then why is it cold outside? why isn't it hot? where does the sun go
at night? where did i park?; rt @oscare2000: #climate engineering
induced changes in key earth system properties  http://t.co/mug0ftucfg
#geoengineering #science #tech #drought #debate rt @emmkaff:
scientists: don't freak out about ebola. everyone: *panic!*
scientists: freak out about climate change. everyone: lol! pass me
some coal.; rt @phil_radford: mt: if you're worried about ebola (100%
of scientists say don't), but not climate change (97% say do), time to
reflect. via @pourmecoffee rt @emmkaff: scientists: don't freak out
about ebola. everyone: *panic!*  scientists: freak out about climate
change. everyone: lol! pass me some coal.; rt @michaelarria: there are
americans who think a black teenager reached for a cop's gun, from 35
feet away, but demand further proof for global warming. rt @nycjim:
study concludes manmade global warming has cause 70 percent of recent
global glacier melt. http://t.co/vclwoecukm http://t.co/wwbaxqgtrx; rt
@richardhine: % who believe climate change largely result of human
activity: china: 93% india: 80% japan: 70% australia: 64% us: 54%
@ipsosmori, via @time rt @emmkaff: scientists: don't freak out about
ebola. everyone: *panic!*  scientists: freak out about climate change.
everyone: lol! pass me some coal.; rt @barackobama: retweet if you
support cutting dangerous carbon pollution. http://t.co/h3pefwnpuj
kentut sapi termasuk penyebab utama global warming, karena
mengeluarkan gas panas yang bisa merusak udara; rt @barackobama:
retweet if you support cutting dangerous carbon pollution.
http://t.co/h3pefwnpuj rt @climatereality: news: world’s youth invited
to present to u.n climate summit. @algore challenges youth to be voice
on climate change http://t.co/j78mwqludf; rt @climatereality: news:
world’s youth invited to present to u.n climate summit. @algore
challenges youth to be voice on climate change http://t.co/j78mwqludf
rt @michaelarria: there are americans who think a black teenager
reached for a cop's gun, from 35 feet away, but demand further proof
for global warming.; rt @barackobama: "epa climate rule economically
feasible, study says" more from @thehill: http://t.co/ivjbdxh7ee
#actonclimate rt @barackobama: retweet if you support cutting
dangerous carbon pollution. http://t.co/h3pefwnpuj; rt @emmkaff:
scientists: don't freak out about ebola. everyone: *panic!*
scientists: freak out about climate change. everyone: lol! pass me
some coal. rt @fredmisntfree76: so scary, but true!! liberals are
worried about global warming, not about psychopaths shown.
http://t.co/war1qq2cmg"; rt @emmkaff: scientists: don't freak out
about ebola. everyone: *panic!*  scientists: freak out about climate
change. everyone: lol! pass me some coal. rt @naturenews: california
has long had wild swings in climate but the current 3-year drought is
more ominous http://t.co/g4jixitv77 http://t.co/8f08ybxspf; rt
@emmkaff: scientists: don't freak out about ebola. everyone: *panic!*
scientists: freak out about climate change. everyone: lol! pass me
some coal. rt @narendramodi: ladakh's prakash (energy), paryavaran
(environment) &amp; paryatan (tourism) are not only j&amp;k's
strength. the entire nation will benefit from it; rt @barackobama:
watch: because of climate change, the average u.s. wildfire season is
longer and more intense. http://t.co/0hqju8u0ld #actonclimate rt
@emmkaff: scientists: don't freak out about ebola. everyone: *panic!*
scientists: freak out about climate change. everyone: lol! pass me
some coal.; rt @aquagreensaving: @nbcbayarea here is the solution to
california's drought http://t.co/dp21ujpjpl rt @emmkaff: scientists:
don't freak out about ebola. everyone: *panic!*  scientists: freak out
about climate change. everyone: lol! pass me some coal.; rt
@barackobama: retweet if you support cutting dangerous carbon
pollution. http://t.co/h3pefwnpuj rt @emmkaff: scientists: don't freak
out about ebola. everyone: *panic!*  scientists: freak out about
climate change. everyone: lol! pass me some coal.; rt @emmkaff:
scientists: don't freak out about ebola. everyone: *panic!*
scientists: freak out about climate change. everyone: lol! pass me
some coal. galih &amp; asyam dr sma kesatuan bangsa berhasil meraih
medali emas di international environment sustainability project
olympiad, holand [gnfi]; rt @michaelarria: there are americans who
think a black teenager reached for a cop's gun, from 35 feet away, but
demand further proof for global warming. rt @un: "we need to
understand we can't do business as usual &amp; a little bit on the
green side" – mary robinson http://t.co/nwxlvtydvu #climate2014

In [1]:
from IPython.core.display import HTML
styles = open("../css/custom.css", "r").read()
HTML(styles)


Out[1]:

In [ ]: