In [1]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the ranked site list; the displayed frame has `serial` and `site` columns.
csv_path = 'top-1m.csv'
df = pd.read_csv(csv_path)

In [4]:
# Show the loaded frame; the rendered output elides the middle rows.
df


Out[4]:
serial site
0 1 google.com
1 2 facebook.com
2 3 youtube.com
3 4 baidu.com
4 5 yahoo.com
5 6 amazon.com
6 7 wikipedia.org
7 8 qq.com
8 9 taobao.com
9 10 twitter.com
10 11 google.co.in
11 12 live.com
12 13 sina.com.cn
13 14 linkedin.com
14 15 weibo.com
15 16 yahoo.co.jp
16 17 google.co.jp
17 18 ebay.com
18 19 yandex.ru
19 20 vk.com
20 21 tmall.com
21 22 blogspot.com
22 23 google.de
23 24 hao123.com
24 25 t.co
25 26 msn.com
26 27 google.co.uk
27 28 bing.com
28 29 amazon.co.jp
29 30 instagram.com
... ... ...
999970 999971 youtubeforchildren.com
999971 999972 shoppingdirectory.ws
999972 999973 fashhackaustralia.com
999973 999974 modatakipet.com
999974 999975 etanto.pl
999975 999976 n1nj4.com
999976 999977 dynamic.ca
999977 999978 redserver.su
999978 999979 sabanne.fr
999979 999980 the-vampire-diaries.org
999980 999981 biokplus.com
999981 999982 projectgus.com
999982 999983 saigonprop.com
999983 999984 freepornolinks.com
999984 999985 swiatwakacji.pl
999985 999986 pornocaiunet.blogspot.com.br
999986 999987 mansbest.ru
999987 999988 adverterenbijeisma.nl
999988 999989 lefilmfrancais.com
999989 999990 floridagunrights.org
999990 999991 themarkcorp.com
999991 999992 smpa.or.kr
999992 999993 cncgeeks.ca
999993 999994 ziegler-coaching.de
999994 999995 crsmithdev.com
999995 999996 dc.com.tw
999996 999997 supersentai.com
999997 999998 bosal.co.za
999998 999999 ahlulbaytportal.com
999999 1000000 glowingeyegames.com

1000000 rows × 2 columns


In [5]:
# TLD (Top Level Domain)
def extract_tld(site):
    """Return the TLD label for a host name.

    Names whose second-to-last label is ``co`` (``google.co.in``) keep the
    last two labels (``co.in``); every other name keeps only the last label
    (``sina.com.cn`` -> ``cn``).
    """
    labels = site.split('.')
    # Check the label position instead of searching for '.co.' anywhere in
    # the name, so 'a.co.b.com' is not reported as 'b.com'.
    if len(labels) >= 3 and labels[-2] == 'co':
        return '.'.join(labels[-2:])
    return labels[-1]


# Build a real list: the builtin map() returns a lazy iterator on Python 3
# rather than a list, so it is not a reliable column value.
df['TLD'] = [extract_tld(site) for site in df['site']]

In [6]:
# Show the frame again to check the newly added TLD column.
df


Out[6]:
serial site TLD
0 1 google.com com
1 2 facebook.com com
2 3 youtube.com com
3 4 baidu.com com
4 5 yahoo.com com
5 6 amazon.com com
6 7 wikipedia.org org
7 8 qq.com com
8 9 taobao.com com
9 10 twitter.com com
10 11 google.co.in co.in
11 12 live.com com
12 13 sina.com.cn cn
13 14 linkedin.com com
14 15 weibo.com com
15 16 yahoo.co.jp co.jp
16 17 google.co.jp co.jp
17 18 ebay.com com
18 19 yandex.ru ru
19 20 vk.com com
20 21 tmall.com com
21 22 blogspot.com com
22 23 google.de de
23 24 hao123.com com
24 25 t.co co
25 26 msn.com com
26 27 google.co.uk co.uk
27 28 bing.com com
28 29 amazon.co.jp co.jp
29 30 instagram.com com
... ... ... ...
999970 999971 youtubeforchildren.com com
999971 999972 shoppingdirectory.ws ws
999972 999973 fashhackaustralia.com com
999973 999974 modatakipet.com com
999974 999975 etanto.pl pl
999975 999976 n1nj4.com com
999976 999977 dynamic.ca ca
999977 999978 redserver.su su
999978 999979 sabanne.fr fr
999979 999980 the-vampire-diaries.org org
999980 999981 biokplus.com com
999981 999982 projectgus.com com
999982 999983 saigonprop.com com
999983 999984 freepornolinks.com com
999984 999985 swiatwakacji.pl pl
999985 999986 pornocaiunet.blogspot.com.br br
999986 999987 mansbest.ru ru
999987 999988 adverterenbijeisma.nl nl
999988 999989 lefilmfrancais.com com
999989 999990 floridagunrights.org org
999990 999991 themarkcorp.com com
999991 999992 smpa.or.kr kr
999992 999993 cncgeeks.ca ca
999993 999994 ziegler-coaching.de de
999994 999995 crsmithdev.com com
999995 999996 dc.com.tw tw
999996 999997 supersentai.com com
999997 999998 bosal.co.za co.za
999998 999999 ahlulbaytportal.com com
999999 1000000 glowingeyegames.com com

1000000 rows × 3 columns


In [7]:
# Count the sites under each TLD. groupby().size() yields one number per
# group directly, so there is no surplus count column to delete and rename.
freq = df.groupby('TLD').size().to_frame('Frequency')
# DataFrame.sort() was removed from pandas; sort_values() is its replacement.
sorted_freq = freq.sort_values(by='Frequency', ascending=False)

In [8]:
# Bar chart of the 20 most frequent TLDs.
sorted_freq.head(20).plot(kind='bar')


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feabb4c0750>

In [9]:
# Bar chart of the TLDs ranked 2-20 (skips the most frequent one).
sorted_freq.iloc[1:20].plot(kind='bar')


Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feab6831b10>

In [10]:
# Split the ranking into consecutive slots of SLOT_SIZE sites and count the
# sites per TLD inside each slot.
SLOT_SIZE = 1000
slots = range(0, len(df), SLOT_SIZE)
# A list, not map(): on Python 3 map() is a one-shot iterator, and dfslots is
# walked once per tld_frequency() call.
dfslots = [df.iloc[start:start + SLOT_SIZE].groupby('TLD').count() for start in slots]

In [11]:
def tld_frequency(tld, slots=None, slot_size=1000):
    """Return the percentage share of ``tld`` in each slot of the ranking.

    Parameters
    ----------
    tld : str
        TLD label to look up in each slot's index, e.g. ``'net'``.
    slots : iterable of DataFrame, optional
        Per-slot frames indexed by TLD with a ``site`` count column.
        Defaults to the notebook-level ``dfslots``.
    slot_size : int, optional
        Number of ranks per slot; used only to build the row labels.

    Returns
    -------
    DataFrame
        One row per slot, labelled ``"<start>-<end>"``, with a single column
        named ``tld`` holding the percentage (0-100).
    """
    if slots is None:
        slots = dfslots
    perc_index = []
    perc_columns = []
    for position, each in enumerate(slots):
        site_counts = each['site']
        # A TLD absent from a slot has no index entry; count it as zero
        # instead of letting the lookup raise KeyError.
        tld_count = site_counts.get(tld, 0)
        total_count = site_counts.sum()
        # Guard the division for a slot that holds no rows at all.
        if total_count:
            percentage = (float(tld_count) / float(total_count)) * 100
        else:
            percentage = 0.0
        start = position * slot_size
        perc_columns.append(percentage)
        perc_index.append("{0}-{1}".format(start, start + slot_size))
    return pd.DataFrame(perc_columns, index=perc_index, columns=[tld])

In [12]:
# Percentage share of the '.net' TLD in every slot
perc_tld = tld_frequency(tld='net')

In [13]:
# Line plot limited to the first 100 rows of the frame
perc_tld.head(100).plot(kind='line')


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feab51afd10>

In [14]:
# Line plot over every row of perc_tld (no slicing, unlike the previous cell)
perc_tld.plot(kind='line')


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feab42a5bd0>

In [15]:
# Percentage share of the '.in' TLD in every slot (rebinds perc_tld)
perc_tld = tld_frequency(tld='in')

In [16]:
# Line plot limited to the first 100 rows of the frame
perc_tld.head(100).plot(kind='line')

"""
Goal: check whether the percentage share of a TLD drops as we move down the list of the top 1 million sites.
"""


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x7feab426cdd0>

In [ ]:
"""
Conclusion: The percentage share of a TLD fluctuates from slot to slot throughout the list.
"""