In [30]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import matplotlib.pyplot as plt

In [31]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['colors']
`%matplotlib` prevents importing * from pylab and numpy

In [32]:
# Download the page that holds the raw subscriber counts.
# The timeout keeps the cell from hanging indefinitely, and raise_for_status()
# stops the notebook from quietly parsing an HTTP error page.
r = requests.get("http://pythonforengineers.com/reddit-raw-data/", timeout=30)
r.raise_for_status()

data = r.text

# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing one).
soup = BeautifulSoup(data, "html.parser")

In [33]:
# Scan every <p> element for "name:count" tokens such as "python:88347".
# If several paragraphs contain matches, the last matching paragraph wins;
# data_found stays None when no paragraph matches at all.
data_found = None
for s in soup('p'):
    # Raw string: the backslashes are meant for the regex engine, not for
    # Python's string-escape processing.
    string_found = re.findall(r"[\w]*\:[\d]+", s.text)
    if string_found:
        data_found = string_found

print(data_found)


[u'c_language:2975', u'cpp:25270', u'csharp:17401', u'objectivec:4039', u'd_language:1584', u'java:37226', u'smalltalk:797', u'golang:12353', u'scala:7264', u'groovy:1124', u'delphi:592', u'python:88347', u'ruby:26401', u'perl:8951', u'Tcl:519', u'lua:3391', u'php:33953', u'javascript:57747', u'fsharp:1719', u'haskell:18614', u'ocaml:2089', u'lisp:9517', u'scheme:3305', u'erlang:4047', u'matlab:6884', u'brainfuck:117', u'cobol:342', u'fortran:833', u'visualbasic:1822']

In [34]:
# Turn each "language:count" token into a {language: count} entry.
# The loop variable is deliberately not called `data`, so the page text stored
# under that name by the download cell is not overwritten.
data_dict = {}
for entry in data_found:
    # The regex that produced these tokens allows exactly one ':' per token,
    # so the split always yields a (name, digits) pair.
    language, count = entry.split(":")
    data_dict[language] = int(count)

print(data_dict)


{u'fsharp': 1719, u'golang': 12353, u'haskell': 18614, u'brainfuck': 117, u'csharp': 17401, u'smalltalk': 797, u'java': 37226, u'scala': 7264, u'delphi': 592, u'perl': 8951, u'lua': 3391, u'matlab': 6884, u'objectivec': 4039, u'scheme': 3305, u'python': 88347, u'javascript': 57747, u'php': 33953, u'ruby': 26401, u'groovy': 1124, u'erlang': 4047, u'visualbasic': 1822, u'lisp': 9517, u'ocaml': 2089, u'd_language': 1584, u'Tcl': 519, u'fortran': 833, u'cpp': 25270, u'cobol': 342, u'c_language': 2975}

In [35]:
# Two-column frame: one row per (language, subscriber count) pair.
column_names = ['Language', 'num_subscribers']
# list(...) materialises the pairs. On Python 3 dict.items() is a lazy view,
# which some pandas versions do not accept as constructor input.
reddit_data = pd.DataFrame(list(data_dict.items()), columns=column_names)

In [36]:
# Show the full table. The function-call form of print works on Python 2 and 3.
print(reddit_data)


       Language  num_subscribers
0        fsharp             1719
1        golang            12353
2       haskell            18614
3     brainfuck              117
4        csharp            17401
5     smalltalk              797
6          java            37226
7         scala             7264
8        delphi              592
9          perl             8951
10          lua             3391
11       matlab             6884
12   objectivec             4039
13       scheme             3305
14       python            88347
15   javascript            57747
16          php            33953
17         ruby            26401
18       groovy             1124
19       erlang             4047
20  visualbasic             1822
21         lisp             9517
22        ocaml             2089
23   d_language             1584
24          Tcl              519
25      fortran              833
26          cpp            25270
27        cobol              342
28   c_language             2975

In [37]:
# Index the table by language name so later output and plots are labelled by it.
# The guard makes the cell safe to re-run: once "Language" is the index it is no
# longer a column, and a second set_index("Language") would raise KeyError.
if reddit_data.index.name != "Language":
    reddit_data = reddit_data.set_index("Language")
print(reddit_data)


             num_subscribers
Language                    
fsharp                  1719
golang                 12353
haskell                18614
brainfuck                117
csharp                 17401
smalltalk                797
java                   37226
scala                   7264
delphi                   592
perl                    8951
lua                     3391
matlab                  6884
objectivec              4039
scheme                  3305
python                 88347
javascript             57747
php                    33953
ruby                   26401
groovy                  1124
erlang                  4047
visualbasic             1822
lisp                    9517
ocaml                   2089
d_language              1584
Tcl                      519
fortran                  833
cpp                    25270
cobol                    342
c_language              2975

In [38]:
# Ten smallest communities: ascending sort, then keep the first ten rows.
# sort_values replaces DataFrame.sort, which newer pandas no longer provides.
print("Least popular languages:")
print(reddit_data.sort_values("num_subscribers").head(10))


Least popular languages:
             num_subscribers
Language                    
brainfuck                117
cobol                    342
Tcl                      519
delphi                   592
smalltalk                797
fortran                  833
groovy                  1124
d_language              1584
fsharp                  1719
visualbasic             1822

In [39]:
# Ten largest communities: descending sort, then keep the first ten rows.
# sort_values replaces DataFrame.sort, which newer pandas no longer provides.
print("Most popular languages:")
print(reddit_data.sort_values("num_subscribers", ascending=False).head(10))


Most popular languages:
            num_subscribers
Language                   
python                88347
javascript            57747
java                  37226
php                   33953
ruby                  26401
cpp                   25270
haskell               18614
csharp                17401
golang                12353
lisp                   9517

In [40]:
# Bar chart of the ten least-subscribed languages.
# sort_values replaces DataFrame.sort, which newer pandas no longer provides.
least_popular = reddit_data.sort_values("num_subscribers").head(10)
least_popular.plot(kind='bar', title="The least popular languages on Reddit")

# Leave room for the rotated language names on the x axis.
plt.tight_layout()

plt.show()



In [41]:
# Five most-subscribed languages. The explicit copy gives top_five its own data,
# so adding a column to it later cannot trigger a SettingWithCopy warning.
top_five = reddit_data.sort_values("num_subscribers", ascending=False).head(5).copy()

# Total subscribers across those five, as a float so that percentages computed
# from it use true division on Python 2 as well. Summing the column directly
# yields a scalar instead of a one-element Series.
sum_five = float(top_five["num_subscribers"].sum())
print(sum_five)


243674.0

In [42]:
# Each language's share of the top-five total, in percent.
# sum_five is a float, so the division is not truncated.
top_five['percent'] = top_five['num_subscribers'] * 100 / sum_five
print(top_five)


            num_subscribers    percent
Language                              
python                88347  36.256228
javascript            57747  23.698466
java                  37226  15.276968
php                   33953  13.933780
ruby                  26401  10.834558

In [43]:
# One colour per slice, in the same order as the rows of top_five.
colors_mine = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'lightcyan']
# Offset only the first slice from the centre; the other four stay in place.
explode = (0.1, 0, 0, 0, 0)
top_five['percent'].plot(
    kind="pie",
    autopct='%.2f%%',  # label each slice with its percentage, two decimals
    shadow=True,
    colors=colors_mine,
    explode=explode,
    startangle=90,
    title="The Top 5 languages on Reddit",
)

plt.show()



In [43]: