In [1]:
%pylab inline
In [2]:
import numpy as np
import pandas as pd
In [3]:
# Load the ranked site list; the first CSV row is used as column names
# (read_csv default) and later cells read a 'site' column from this frame.
# NOTE(review): assumes 'top-1m.csv' sits in the working directory and has a
# header row containing 'site' — confirm against the data file.
df = pd.read_csv('top-1m.csv')
In [4]:
# Preview only the first rows instead of rendering the whole frame.
df.head()
Out[4]:
In [5]:
# TLD (Top Level Domain)
def _extract_tld(site):
    """Return the TLD of a host name.

    Hosts whose second-to-last label is 'co' (e.g. 'example.co.uk') yield the
    last two labels ('co.uk'); every other host yields only its last label.
    Looking at the label position, rather than searching for '.co.' anywhere
    in the string, keeps 'foo.co.example.com' from being reported as
    'example.com'.
    """
    labels = site.split('.')
    if len(labels) >= 3 and labels[-2] == 'co':
        return '.'.join(labels[-2:])
    return labels[-1]


# Build a concrete list: the builtin map() is a lazy iterator on Python 3.
df['TLD'] = [_extract_tld(site) for site in df['site']]
In [6]:
# Preview the first rows to check the new 'TLD' column without rendering the whole frame.
df.head()
Out[6]:
In [7]:
# Number of sites per TLD. Counting the 'site' column explicitly means the
# result no longer depends on how many other columns the CSV happens to have.
freq = df.groupby('TLD')[['site']].count()
freq.columns = ['Frequency']
# Most frequent TLD first. DataFrame.sort(columns=...) was removed from
# pandas; sort_values(by=...) is its replacement.
sorted_freq = freq.sort_values(by='Frequency', ascending=False)
In [8]:
# Bar chart of the 20 most frequent TLDs
sorted_freq.iloc[:20].plot.bar()
Out[8]:
In [9]:
# Same bar chart without the most frequent TLD (ranks 2-20)
sorted_freq.iloc[1:20].plot.bar()
Out[9]:
In [10]:
# Start offsets of consecutive 1000-row slots of the ranked list. Derived from
# the actual row count so a shorter file does not produce empty slots.
slots = range(0, len(df), 1000)
# Per-slot counts grouped by TLD. Built as a list (not a lazy map) because
# tld_frequency() iterates over dfslots on every call; a Python 3 map object
# would be exhausted after the first call.
dfslots = [df[start:start + 1000].groupby('TLD').count() for start in slots]
In [11]:
def tld_frequency(tld, slot_frames=None, slot_size=1000):
    """Return the percentage share of ``tld`` within each slot.

    Parameters
    ----------
    tld : str
        Index label to look up in every slot frame (e.g. 'net').
    slot_frames : iterable of DataFrame, optional
        Per-slot count frames indexed by TLD with a 'site' column. Defaults to
        the notebook-level ``dfslots``.
    slot_size : int, optional
        Number of rows each slot represents; only used to build the
        "start-end" index labels. Defaults to 1000.

    Returns
    -------
    DataFrame
        One row per slot, indexed by "start-end" labels, with a single column
        named ``tld`` holding the percentage (0-100).

    A slot that does not contain ``tld`` contributes 0.0 instead of raising
    KeyError, and a slot with no sites at all also contributes 0.0 instead of
    dividing by zero.
    """
    if slot_frames is None:
        slot_frames = dfslots

    labels = []
    percentages = []
    for position, counts in enumerate(slot_frames):
        site_counts = counts['site']
        total_count = site_counts.sum()
        # .get() falls back to 0 when the TLD never occurs in this slot.
        tld_count = site_counts.get(tld, 0)
        if total_count:
            # float() keeps true division on Python 2 as well.
            percentage = (float(tld_count) / float(total_count)) * 100
        else:
            percentage = 0.0
        start = position * slot_size
        labels.append("{0}-{1}".format(start, start + slot_size))
        percentages.append(percentage)
    return pd.DataFrame(percentages, index=labels, columns=[tld])
In [12]:
# Percentage frequency of the 'net' TLD in each slot, as computed by
# tld_frequency(). The key is passed without a leading dot.
perc_tld = tld_frequency('net')
In [13]:
# Line chart of the first 100 rows of perc_tld
perc_tld.iloc[:100].plot.line()
Out[13]:
In [14]:
# Line chart over every row of perc_tld
perc_tld.plot.line()
Out[14]:
In [15]:
# Percentage frequency of the 'in' TLD in each slot, as computed by
# tld_frequency(). This rebinds perc_tld, replacing the earlier 'net' result.
perc_tld = tld_frequency('in')
In [16]:
# Line chart of the first 100 rows of perc_tld
perc_tld.iloc[:100].plot.line()
"""
Goal: check whether a TLD's percentage share drops as we move down the list of the top 1 million sites.
"""
Out[16]:
In [ ]:
"""
Conclusion: The percentage share of a TLD fluctuates throughout the list of top sites.
"""