notebook.community

Edit and run



In [1]:

    
%pylab inline









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
import numpy as np
import pandas as pd



In [3]:

    
# Read the site list from the local CSV file 'top-1m.csv' into a DataFrame.
df = pd.read_csv('top-1m.csv')



In [4]:

    
# Display the loaded frame; the rendered output shows the columns `serial` and `site`.
df









    Out[4]:






  
    
      
      serial
      site
    
  
  
    
      0
      1
      google.com
    
    
      1
      2
      facebook.com
    
    
      2
      3
      youtube.com
    
    
      3
      4
      baidu.com
    
    
      4
      5
      yahoo.com
    
    
      5
      6
      amazon.com
    
    
      6
      7
      wikipedia.org
    
    
      7
      8
      qq.com
    
    
      8
      9
      taobao.com
    
    
      9
      10
      twitter.com
    
    
      10
      11
      google.co.in
    
    
      11
      12
      live.com
    
    
      12
      13
      sina.com.cn
    
    
      13
      14
      linkedin.com
    
    
      14
      15
      weibo.com
    
    
      15
      16
      yahoo.co.jp
    
    
      16
      17
      google.co.jp
    
    
      17
      18
      ebay.com
    
    
      18
      19
      yandex.ru
    
    
      19
      20
      vk.com
    
    
      20
      21
      tmall.com
    
    
      21
      22
      blogspot.com
    
    
      22
      23
      google.de
    
    
      23
      24
      hao123.com
    
    
      24
      25
      t.co
    
    
      25
      26
      msn.com
    
    
      26
      27
      google.co.uk
    
    
      27
      28
      bing.com
    
    
      28
      29
      amazon.co.jp
    
    
      29
      30
      instagram.com
    
    
      ...
      ...
      ...
    
    
      999970
      999971
      youtubeforchildren.com
    
    
      999971
      999972
      shoppingdirectory.ws
    
    
      999972
      999973
      fashhackaustralia.com
    
    
      999973
      999974
      modatakipet.com
    
    
      999974
      999975
      etanto.pl
    
    
      999975
      999976
      n1nj4.com
    
    
      999976
      999977
      dynamic.ca
    
    
      999977
      999978
      redserver.su
    
    
      999978
      999979
      sabanne.fr
    
    
      999979
      999980
      the-vampire-diaries.org
    
    
      999980
      999981
      biokplus.com
    
    
      999981
      999982
      projectgus.com
    
    
      999982
      999983
      saigonprop.com
    
    
      999983
      999984
      freepornolinks.com
    
    
      999984
      999985
      swiatwakacji.pl
    
    
      999985
      999986
      pornocaiunet.blogspot.com.br
    
    
      999986
      999987
      mansbest.ru
    
    
      999987
      999988
      adverterenbijeisma.nl
    
    
      999988
      999989
      lefilmfrancais.com
    
    
      999989
      999990
      floridagunrights.org
    
    
      999990
      999991
      themarkcorp.com
    
    
      999991
      999992
      smpa.or.kr
    
    
      999992
      999993
      cncgeeks.ca
    
    
      999993
      999994
      ziegler-coaching.de
    
    
      999994
      999995
      crsmithdev.com
    
    
      999995
      999996
      dc.com.tw
    
    
      999996
      999997
      supersentai.com
    
    
      999997
      999998
      bosal.co.za
    
    
      999998
      999999
      ahlulbaytportal.com
    
    
      999999
      1000000
      glowingeyegames.com
    
  

1000000 rows × 2 columns



In [5]:

    
# TLD (Top Level Domain)
def extract_tld(site):
    """Return the TLD of a domain name.

    Domains whose second-to-last label is ``co`` (e.g. ``google.co.in``) keep
    the two-label suffix (``co.in``); every other domain yields only its last
    label (e.g. ``sina.com.cn`` -> ``cn``).

    Parameters
    ----------
    site : str
        Domain name such as ``'google.co.in'``.

    Returns
    -------
    str
        The extracted suffix, without a leading dot.
    """
    labels = site.split('.')
    # Check the label position rather than searching for '.co.' anywhere in
    # the name, so a 'co' label further to the left cannot trigger the
    # two-label branch.
    if len(labels) >= 3 and labels[-2] == 'co':
        return '.'.join(labels[-2:])
    return labels[-1]


# Series.map returns an index-aligned Series on both Python 2 and Python 3,
# whereas the builtin map() is a one-shot iterator on Python 3.
df['TLD'] = df['site'].map(extract_tld)



In [6]:

    
# Display the frame again to check the newly added `TLD` column.
df









    Out[6]:






  
    
      
      serial
      site
      TLD
    
  
  
    
      0
      1
      google.com
      com
    
    
      1
      2
      facebook.com
      com
    
    
      2
      3
      youtube.com
      com
    
    
      3
      4
      baidu.com
      com
    
    
      4
      5
      yahoo.com
      com
    
    
      5
      6
      amazon.com
      com
    
    
      6
      7
      wikipedia.org
      org
    
    
      7
      8
      qq.com
      com
    
    
      8
      9
      taobao.com
      com
    
    
      9
      10
      twitter.com
      com
    
    
      10
      11
      google.co.in
      co.in
    
    
      11
      12
      live.com
      com
    
    
      12
      13
      sina.com.cn
      cn
    
    
      13
      14
      linkedin.com
      com
    
    
      14
      15
      weibo.com
      com
    
    
      15
      16
      yahoo.co.jp
      co.jp
    
    
      16
      17
      google.co.jp
      co.jp
    
    
      17
      18
      ebay.com
      com
    
    
      18
      19
      yandex.ru
      ru
    
    
      19
      20
      vk.com
      com
    
    
      20
      21
      tmall.com
      com
    
    
      21
      22
      blogspot.com
      com
    
    
      22
      23
      google.de
      de
    
    
      23
      24
      hao123.com
      com
    
    
      24
      25
      t.co
      co
    
    
      25
      26
      msn.com
      com
    
    
      26
      27
      google.co.uk
      co.uk
    
    
      27
      28
      bing.com
      com
    
    
      28
      29
      amazon.co.jp
      co.jp
    
    
      29
      30
      instagram.com
      com
    
    
      ...
      ...
      ...
      ...
    
    
      999970
      999971
      youtubeforchildren.com
      com
    
    
      999971
      999972
      shoppingdirectory.ws
      ws
    
    
      999972
      999973
      fashhackaustralia.com
      com
    
    
      999973
      999974
      modatakipet.com
      com
    
    
      999974
      999975
      etanto.pl
      pl
    
    
      999975
      999976
      n1nj4.com
      com
    
    
      999976
      999977
      dynamic.ca
      ca
    
    
      999977
      999978
      redserver.su
      su
    
    
      999978
      999979
      sabanne.fr
      fr
    
    
      999979
      999980
      the-vampire-diaries.org
      org
    
    
      999980
      999981
      biokplus.com
      com
    
    
      999981
      999982
      projectgus.com
      com
    
    
      999982
      999983
      saigonprop.com
      com
    
    
      999983
      999984
      freepornolinks.com
      com
    
    
      999984
      999985
      swiatwakacji.pl
      pl
    
    
      999985
      999986
      pornocaiunet.blogspot.com.br
      br
    
    
      999986
      999987
      mansbest.ru
      ru
    
    
      999987
      999988
      adverterenbijeisma.nl
      nl
    
    
      999988
      999989
      lefilmfrancais.com
      com
    
    
      999989
      999990
      floridagunrights.org
      org
    
    
      999990
      999991
      themarkcorp.com
      com
    
    
      999991
      999992
      smpa.or.kr
      kr
    
    
      999992
      999993
      cncgeeks.ca
      ca
    
    
      999993
      999994
      ziegler-coaching.de
      de
    
    
      999994
      999995
      crsmithdev.com
      com
    
    
      999995
      999996
      dc.com.tw
      tw
    
    
      999996
      999997
      supersentai.com
      com
    
    
      999997
      999998
      bosal.co.za
      co.za
    
    
      999998
      999999
      ahlulbaytportal.com
      com
    
    
      999999
      1000000
      glowingeyegames.com
      com
    
  

1000000 rows × 3 columns



In [7]:

    
# Number of sites per TLD: count the `serial` entries in each TLD group and
# store the result in a one-column frame named 'Frequency'.
freq = df.groupby('TLD')['serial'].count().to_frame('Frequency')
# DataFrame.sort(columns=...) has been removed from pandas; sort_values is the
# supported way to order rows by a column. Most frequent TLD comes first.
sorted_freq = freq.sort_values(by='Frequency', ascending=False)



In [8]:

    
# Bar chart of the 20 most frequent TLDs
sorted_freq.head(20).plot(kind='bar')









    Out[8]:





<matplotlib.axes._subplots.AxesSubplot at 0x7feabb4c0750>



In [9]:

    
# Bar chart of the TLDs ranked 2nd through 20th (the most frequent one is left out)
sorted_freq.iloc[1:20].plot(kind='bar')









    Out[9]:





<matplotlib.axes._subplots.AxesSubplot at 0x7feab6831b10>



In [10]:

    
# Split the ranking into consecutive slots of SLOT_SIZE sites and count, for
# each slot, how many sites fall under each TLD.
SLOT_SIZE = 1000
# Derive the upper bound from the data instead of hard-coding the row count.
slots = range(0, len(df), SLOT_SIZE)
# Build a list rather than using the builtin map(): on Python 3 a map object
# is exhausted after one pass, so a second iteration over it would be empty.
dfslots = [df.iloc[start:start + SLOT_SIZE].groupby('TLD').count() for start in slots]



In [11]:

    
def tld_frequency(tld, slots=None, slot_size=1000):
    """Compute the percentage of sites using ``tld`` in each slot.

    Parameters
    ----------
    tld : str
        TLD label to look up in each slot's index (e.g. ``'net'``).
    slots : iterable of DataFrame, optional
        Per-slot count frames indexed by TLD with a ``site`` column holding
        the number of sites. Defaults to the notebook-level ``dfslots``.
    slot_size : int, optional
        Number of rows each slot represents; used only to build the row
        labels. Defaults to 1000.

    Returns
    -------
    DataFrame
        One row per slot, labelled ``"<start>-<end>"``, with a single column
        named after ``tld`` holding the percentage (0-100). A slot in which
        the TLD does not occur, or which contains no sites, yields 0.0.
    """
    if slots is None:
        slots = dfslots
    labels = []
    percentages = []
    for position, counts in enumerate(slots):
        site_counts = counts['site']
        total_count = site_counts.sum()
        # .get avoids a KeyError when no site in this slot uses the TLD.
        tld_count = site_counts.get(tld, 0)
        if total_count:
            percentage = (float(tld_count) / float(total_count)) * 100
        else:
            # Empty slot: report 0.0 instead of dividing by zero.
            percentage = 0.0
        start = position * slot_size
        percentages.append(percentage)
        labels.append("{0}-{1}".format(start, start + slot_size))
    return pd.DataFrame(percentages, index=labels, columns=[tld])



In [12]:

    
# Percentage share of the 'net' TLD in each slot
perc_tld = tld_frequency(tld='net')



In [13]:

    
# Line plot restricted to the first 100 rows of the frame
perc_tld.head(100).plot(kind='line')









    Out[13]:





<matplotlib.axes._subplots.AxesSubplot at 0x7feab51afd10>



In [14]:

    
# Line plot over every row of the frame ('line' is the default plot kind)
perc_tld.plot()









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x7feab42a5bd0>



In [15]:

    
# Percentage share of the 'in' TLD in each slot
perc_tld = tld_frequency(tld='in')



In [16]:

    
# We were trying to see whether the percentage of the TLD drops as we go down
# the list of the top 1 million sites.
# (Kept as comments: a bare string literal placed last in the cell would be
# echoed as the cell's result instead of the plot handle.)
# Plot first 100 rows
perc_tld[:100].plot(kind='line')









    Out[16]:





<matplotlib.axes._subplots.AxesSubplot at 0x7feab426cdd0>



In [ ]:

    
"""
Conclusion: The percentage share of a TLD fluctuates throughout the ranking.
"""

	serial	site
0	1	google.com
1	2	facebook.com
2	3	youtube.com
3	4	baidu.com
4	5	yahoo.com
5	6	amazon.com
6	7	wikipedia.org
7	8	qq.com
8	9	taobao.com
9	10	twitter.com
10	11	google.co.in
11	12	live.com
12	13	sina.com.cn
13	14	linkedin.com
14	15	weibo.com
15	16	yahoo.co.jp
16	17	google.co.jp
17	18	ebay.com
18	19	yandex.ru
19	20	vk.com
20	21	tmall.com
21	22	blogspot.com
22	23	google.de
23	24	hao123.com
24	25	t.co
25	26	msn.com
26	27	google.co.uk
27	28	bing.com
28	29	amazon.co.jp
29	30	instagram.com
...	...	...
999970	999971	youtubeforchildren.com
999971	999972	shoppingdirectory.ws
999972	999973	fashhackaustralia.com
999973	999974	modatakipet.com
999974	999975	etanto.pl
999975	999976	n1nj4.com
999976	999977	dynamic.ca
999977	999978	redserver.su
999978	999979	sabanne.fr
999979	999980	the-vampire-diaries.org
999980	999981	biokplus.com
999981	999982	projectgus.com
999982	999983	saigonprop.com
999983	999984	freepornolinks.com
999984	999985	swiatwakacji.pl
999985	999986	pornocaiunet.blogspot.com.br
999986	999987	mansbest.ru
999987	999988	adverterenbijeisma.nl
999988	999989	lefilmfrancais.com
999989	999990	floridagunrights.org
999990	999991	themarkcorp.com
999991	999992	smpa.or.kr
999992	999993	cncgeeks.ca
999993	999994	ziegler-coaching.de
999994	999995	crsmithdev.com
999995	999996	dc.com.tw
999996	999997	supersentai.com
999997	999998	bosal.co.za
999998	999999	ahlulbaytportal.com
999999	1000000	glowingeyegames.com