In [43]:
# let's separate the fields of the scraped tweets
# understanding the Streaming API's JSON tweet structure:
# id_str, text, created_at, user[screen_name, name],
# entities[user_mentions, hashtags, urls], geo[coordinates], place[full_name, place_type]
# separating based on the fields above and converting to CSV
import json
from csv import writer

tweets = []
for line in open('stream_messi.json'):
    tweets.append(json.loads(line))

ids = [tweet['id_str'] for tweet in tweets]
texts = [tweet['text'] for tweet in tweets]
times = [tweet['created_at'] for tweet in tweets]
screen_names = [tweet['user']['screen_name'] for tweet in tweets]
names = [tweet['user']['name'] for tweet in tweets]
# entities may hold zero, one, or more mentions/hashtags/urls; keep the first two of each
mentions1 = [(T['entities']['user_mentions'][0]['screen_name'] if len(T['entities']['user_mentions']) >= 1 else None) for T in tweets]
mentions2 = [(T['entities']['user_mentions'][1]['screen_name'] if len(T['entities']['user_mentions']) >= 2 else None) for T in tweets]
hashtags1 = [(T['entities']['hashtags'][0]['text'] if len(T['entities']['hashtags']) >= 1 else None) for T in tweets]
hashtags2 = [(T['entities']['hashtags'][1]['text'] if len(T['entities']['hashtags']) >= 2 else None) for T in tweets]
urls1 = [(T['entities']['urls'][0]['expanded_url'] if len(T['entities']['urls']) >= 1 else None) for T in tweets]
urls2 = [(T['entities']['urls'][1]['expanded_url'] if len(T['entities']['urls']) >= 2 else None) for T in tweets]
# geo and place are often None, so guard the lookups
lats = [(T['geo']['coordinates'][0] if T['geo'] else None) for T in tweets]
lons = [(T['geo']['coordinates'][1] if T['geo'] else None) for T in tweets]
place_names = [(T['place']['full_name'] if T['place'] else None) for T in tweets]
place_types = [(T['place']['place_type'] if T['place'] else None) for T in tweets]

header = ['id', 'created', 'text', 'screen name', 'name', 'mention 1', 'mention 2',
          'hashtag 1', 'hashtag 2', 'url 1', 'url 2', 'lat', 'lon', 'place name', 'place type']
rows = zip(ids, times, texts, screen_names, names, mentions1, mentions2,
           hashtags1, hashtags2, urls1, urls2, lats, lons, place_names, place_types)
with open('messi_tweets.csv', 'w', newline='', encoding='utf-8') as out:
    csv_writer = writer(out)
    csv_writer.writerow(header)
    for row in rows:
        csv_writer.writerow(row)
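To sanity-check the export, a minimal read-back with pandas (an optional sketch; pandas is only otherwise used further down in this notebook):
In [ ]:
# sketch: read the exported CSV back and peek at the first rows
import pandas as pd
df = pd.read_csv('messi_tweets.csv')
print(df.shape)
df[['id', 'created', 'screen name', 'hashtag 1']].head()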
In [50]:
# inspect the second character of each collected tweet's text
for T in tweets:
    print(T['text'][1])
In [100]:
#natural language toolkit
#tokenization
import nltk
from nltk.tokenize import word_tokenize
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(word_tokenize(tweet))
In [12]:
# NLTK's word_tokenize vs the custom tokenize() (the regex tokenizer defined in the next cell)
# note: run the next cell first, otherwise tokenize() is not defined yet
import nltk
from nltk.tokenize import word_tokenize
tweet = 'RT @marcobonzanini: just an example! :D http://example.com #NLP'
print(tokenize(tweet))
In [92]:
# regular expressions
# preparing the preprocess function
import re

emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)

def tokenize(s):
    return tokens_re.findall(s)

def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
tweet = "RT @marcobonzanini: just an example! :D http://example.com #NLP"
print(preprocess(tweet))
# ['RT', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#NLP']
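A quick follow-up (a sketch assuming the cell above has been run): with lowercase=True, ordinary tokens are lowercased while emoticons are preserved by the emoticon_re check.
In [ ]:
# sketch: lowercase everything except emoticons, using preprocess() from the cell above
print(preprocess(tweet, lowercase=True))
# roughly: ['rt', '@marcobonzanini', ':', 'just', 'an', 'example', '!', ':D', 'http://example.com', '#nlp']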
In [52]:
import json
arr = []
with open('stream_cr7.json', 'r') as f:
    lines = f.readlines()
    for line in lines:
        tweet = json.loads(line)
        texts = tweet['text'].split(', ')
        #token = preprocess(tweet['text'])
        arr.append(texts)
        #data = json.dumps(token, indent=4)
        #print(data)
arr
Out[52]:
In [96]:
#removing stopwords from tweets
from nltk.corpus import stopwords
import string
punctuation=list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['RT', 'via','de','el','que','u2026','xe9','#EURO2016','#Euro2016','India','india','#']
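A small sanity check (a sketch that assumes preprocess() from the earlier cell and the stop list just defined): filtering a tokenized tweet against stop removes punctuation, common English words, and 'RT'.
In [ ]:
# sketch: apply the stop list to a tokenized tweet (the sample text is made up)
sample = "RT @user: Messi scores again, what a goal! #GOAT http://example.com"
print([t for t in preprocess(sample) if t not in stop])
# roughly: ['@user', 'Messi', 'scores', 'goal', '#GOAT', 'http://example.com']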
In [94]:
# helper that converts unicode JSON values to byte strings
# note: Python 2 only (uses iteritems/unicode); not needed on Python 3, where str is already unicode
def byteify(input):
    if isinstance(input, dict):
        return {byteify(key): byteify(value)
                for key, value in input.iteritems()}
    elif isinstance(input, list):
        return [byteify(element) for element in input]
    elif isinstance(input, unicode):
        return input.encode('utf-8')
    else:
        return input
In [21]:
# counting the number of hashtags and finding the most common ones
# coding=utf-8
import operator
import json
import yaml
from collections import Counter

fname = 'stream_india.json'
with open(fname, 'r') as f:
    count_all = Counter()
    for line in f:
        tweet = json.loads(line)
        twt = tweet['text']
        # Create a list with all the terms
        terms_all = [term for term in preprocess(twt) if term not in stop]
        # Count terms only once, equivalent to Document Frequency
        terms_single = set(terms_all)
        # Count hashtags only
        terms_hash = [term for term in preprocess(tweet['text']) if term.startswith('#')]
        # Count terms only (no hashtags, no mentions)
        terms_only = [term for term in preprocess(tweet['text'])
                      if term not in stop and not term.startswith(('#', '@'))]
        # mind the ((double brackets)):
        # startswith() takes a tuple (not a list) if we pass multiple prefixes
        # Update the counter
        count_all.update(terms_hash)
    # Print the 10 most frequent hashtags
    print(count_all.most_common(10))
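The cell above builds terms_single and terms_only but only feeds hashtags to the counter; here is a sketch of how the other lists could drive their own counters (count_single and count_terms are hypothetical names; preprocess() and stop come from the earlier cells):
In [ ]:
# sketch: separate counters for document frequency and for plain terms (no hashtags/mentions)
import json
from collections import Counter
count_single = Counter()  # each term counted at most once per tweet (document frequency)
count_terms = Counter()   # plain terms only
with open('stream_india.json', 'r') as f:
    for line in f:
        tweet = json.loads(line)
        terms_all = [term for term in preprocess(tweet['text']) if term not in stop]
        count_single.update(set(terms_all))
        count_terms.update(t for t in terms_all if not t.startswith(('#', '@')))
print(count_single.most_common(10))
print(count_terms.most_common(10))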
In [8]:
import json
import csv
data_json = open('stream_nepal.json', mode='r').read()
data_python = json.loads(data_json)  # assumes the file holds a single JSON array of tweets; otherwise read it line by line as above
csv_out = open('tweets_out_ASCII.csv', mode='w') #opens csv file
writer = csv.writer(csv_out) #create the csv writer object
fields = ['created_at', 'text', 'screen_name', 'followers', 'friends', 'rt', 'fav'] #field names
writer.writerow(fields) #writes the header row
for line in data_python:
    #writes a row, pulling the fields out of the json object;
    #screen_name and followers/friends live on the second level, hence the chained get() calls
    writer.writerow([line.get('created_at'),
                     line.get('text').encode('unicode_escape').decode('ascii'), #unicode escape to fix emoji issue
                     line.get('user').get('screen_name'),
                     line.get('user').get('followers_count'),
                     line.get('user').get('friends_count'),
                     line.get('retweet_count'),
                     line.get('favorite_count')])
csv_out.close()
In [147]:
import json
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

tweets_data_path = 'twitter_nepal.txt'
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except ValueError:
        # skip blank or malformed lines
        continue
print(len(tweets_data))
In [149]:
#Data Frame Yayyyyyyyyyyyyy!
tweets=pd.DataFrame()
In [150]:
#set three columns: text, lang, country
#count tweets by lang
tweets['text'] = list(map(lambda tweet: tweet['text'], tweets_data))
tweets['lang'] = list(map(lambda tweet: tweet['lang'], tweets_data))
tweets['country'] = list(map(lambda tweet: tweet['place']['country'] if tweet['place'] is not None else None, tweets_data))
tweets_by_lang = tweets['lang'].value_counts()
tweets_by_lang
Out[150]:
In [151]:
#bar chart of tweet counts by language (plt was imported above)
fig, ax = plt.subplots()
ax.tick_params(axis='x', labelsize=15)
ax.tick_params(axis='y', labelsize=10)
ax.set_xlabel('Languages', fontsize=15)
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Top 5 languages', fontsize=15, fontweight='bold')
tweets_by_lang[:5].plot(ax=ax, kind='bar', color='red')
Out[151]:
In [152]:
tweets_by_country = tweets['country'].value_counts()
tweets_by_country
Out[152]:
In [154]:
# tokenize each tweet text with the regex tokenizer defined earlier
nep = [tokenize(text) for text in tweets['text']]
In [156]:
tweets
Out[156]:
In [47]:
import re
def word_in_text(word, text):
    word = word.lower()
    text = text.lower()
    match = re.search(word, text)
    if match:
        return True
    return False
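A quick check of word_in_text (note that re.search matches substrings, so 'ruby' would also match inside a longer word unless a word boundary such as r'\bruby\b' is used):
In [ ]:
# minimal usage sketch for word_in_text() (the sample strings are made up)
print(word_in_text('python', 'Learning Python and JavaScript'))  # True
print(word_in_text('ruby', 'No mention of it here'))              # False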
In [48]:
tweets['python'] = tweets['text'].apply(lambda tweet: word_in_text('python', tweet))
tweets['javascript'] = tweets['text'].apply(lambda tweet: word_in_text('javascript', tweet))
tweets['ruby'] = tweets['text'].apply(lambda tweet: word_in_text('ruby', tweet))
In [67]:
print(tweets['python'].value_counts()[True])
print(tweets['javascript'].value_counts()[True])
print(tweets['ruby'].value_counts()[True])
In [51]:
prg_langs = ['python', 'javascript', 'ruby']
tweets_by_prg_lang = [tweets['python'].value_counts()[True], tweets['javascript'].value_counts()[True],
tweets['ruby'].value_counts()[True]]
x_pos = list(range(len(prg_langs)))
width = 0.8
fig, ax = plt.subplots()
plt.bar(x_pos, tweets_by_prg_lang, width, alpha=1, color='g')
# Setting axis labels and ticks
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Ranking: python vs. javascript vs. ruby (Raw data)', fontsize=10, fontweight='bold')
ax.set_xticks([p + 0.4 * width for p in x_pos])
ax.set_xticklabels(prg_langs)
plt.grid()
In [144]:
#tweets['programming'] = tweets['text'].apply(lambda tweet: word_in_text('programming', tweet))
tweets['nepal'] = tweets['text'].apply(lambda tweet: word_in_text('nepal', tweet))
#tweets['relevant'] = tweets['text'].apply(lambda tweet: word_in_text('usa', tweet) or word_in_text('uk', tweet))
In [139]:
#print(tweets['programming'].value_counts()[True])
#print(tweets['tutorial'].value_counts()[False])
# note: the 'relevant' column comes from the commented-out line in the previous cell
print(tweets['relevant'].value_counts()[True])
In [140]:
#print(tweets[tweets['relevant'] == True]['python'].value_counts()[False])
#print(tweets[tweets['relevant'] == True]['javascript'].value_counts()[True])
print(tweets[tweets['relevant'] == True]['ruby'].value_counts()[False])
In [141]:
# use .get(True, 0) so a language with no relevant matches doesn't raise a KeyError
tweets_by_prg_lang = [tweets[tweets['relevant'] == True]['python'].value_counts().get(True, 0),
                      tweets[tweets['relevant'] == True]['javascript'].value_counts().get(True, 0),
                      tweets[tweets['relevant'] == True]['ruby'].value_counts().get(True, 0)]
x_pos = list(range(len(prg_langs)))
width = 0.8
fig, ax = plt.subplots()
plt.bar(x_pos, tweets_by_prg_lang, width,alpha=1,color='g')
ax.set_ylabel('Number of tweets', fontsize=15)
ax.set_title('Ranking: python vs. javascript vs. ruby (Relevant data)', fontsize=10, fontweight='bold')
ax.set_xticks([p + 0.4 * width for p in x_pos])
ax.set_xticklabels(prg_langs)
plt.grid()
In [135]:
def extract_link(text):
    regex = r'https?://[^\s<>"]+|www\.[^\s<>"]+'
    match = re.search(regex, text)
    if match:
        return match.group()
    return ''
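A quick usage sketch for extract_link() (the sample strings are made up):
In [ ]:
# minimal usage sketch for extract_link()
print(extract_link('Quake update at http://example.com/nepal #earthquake'))  # http://example.com/nepal
print(extract_link('no link in this tweet'))                                  # '' (empty string)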
In [136]:
tweets['link'] = tweets['text'].apply(lambda tweet: extract_link(tweet))
In [143]:
tweets['text'][6]
Out[143]:
In [80]:
tweets_relevant = tweets[tweets['relevant'] == True]
tweets_relevant_with_link = tweets_relevant[tweets_relevant['link'] != '']
In [145]:
tweets_relevant_with_link
Out[145]:
In [165]:
tweets['text'][0].encode('utf-8')
Out[165]:
In [167]:
tweets['text'][0]
Out[167]: