In [1]:
%pylab inline
import pandas as pd
from textwrap import dedent


Populating the interactive namespace from numpy and matplotlib

In [2]:
!ls ../attempt3_19k/


fetch_yahoo.csv		outgoingurls_yahoo_new.csv  urls_yahoo_new.csv
outgoingurls_yahoo.csv	urls_yahoo.csv		    visit_yahoo.csv

Process URLs


In [15]:
urls_to_write = 'url,status\n'

with open('../attempt3_19k/urls_yahoo.csv') as fh:
    for index, line in enumerate(fh):
        if index == 0:
            # skip the header row
            continue
        splitted = line.rstrip('\n').split(',')
        if len(splitted) < 2:
            print(line)
            raise RuntimeError('expected at least two comma-separated fields')
        # URLs may themselves contain commas, so everything except the
        # last field (the crawl status) belongs to the URL; rejoin it
        # with '-' so the URL stays a single CSV field
        url = '-'.join(splitted[:-1])
        # Re-derive the status: a URL is within the news site if it
        # lives under news.yahoo.com or www.yahoo.com/news
        if ('https://news.yahoo.com' in url
                or 'https://www.yahoo.com/news' in url):
            status = 'OK'
        else:
            status = 'N_OK'
        urls_to_write += '{},{}\n'.format(url, status)

with open('../attempt3_19k/urls_yahoo_new.csv', 'w') as fh:
    fh.write(urls_to_write)
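The field-count cases above can be avoided entirely: only the last comma separates the URL from its status, so str.rsplit with maxsplit=1 does the split in one step. A minimal sketch, not what the notebook ran:

# Sketch: parse "url,status" lines whose URL may itself contain commas.
with open('../attempt3_19k/urls_yahoo.csv') as fh:
    next(fh)  # skip the header row
    for line in fh:
        url, status = line.rstrip('\n').rsplit(',', 1)
        url = url.replace(',', '-')  # keep the URL a single CSV field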

In [16]:
urls_df = pd.read_csv('../attempt3_19k/urls_yahoo_new.csv')
urls_df.status.value_counts()


Out[16]:
N_OK    710048
OK           3
Name: status, dtype: int64

In [ ]:
# Process outgoing URLs: replace embedded commas so each URL stays a
# single CSV field
urls_to_write = 'url\n'
with open('../attempt3_19k/outgoingurls_yahoo.csv') as fh:
    for index, line in enumerate(fh):
        if index == 0:
            # skip the header row
            continue
        # strip the newline here so the '{}\n' below does not write a
        # blank line after every URL
        line = line.rstrip('\n').replace(',', '-')
        urls_to_write += '{}\n'.format(line)
with open('../attempt3_19k/outgoingurls_yahoo_new.csv', 'w') as fh:
    fh.write(urls_to_write)
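The same cleanup can be written in pandas. A sketch, assuming the URLs contain neither tabs nor quote characters, so each raw line parses as a single field:

# Sketch: read each line as one column, fix commas, write back out.
outgoing = pd.read_csv('../attempt3_19k/outgoingurls_yahoo.csv',
                       sep='\t', skiprows=1, names=['url'])
outgoing['url'] = outgoing['url'].str.replace(',', '-')
outgoing.to_csv('../attempt3_19k/outgoingurls_yahoo_new.csv', index=False)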

In [4]:
fetch_df = pd.read_csv('../attempt3_19k/fetch_yahoo.csv')
outgoingurls = pd.read_csv('../attempt3_19k/outgoingurls_yahoo_new.csv', header=0)
urls_df = pd.read_csv('../attempt3_19k/urls_yahoo_new.csv')
visits_df = pd.read_csv('../attempt3_19k/visit_yahoo.csv')

In [5]:
fetch_df.http_status_code.value_counts()


Out[5]:
200    18988
Name: http_status_code, dtype: int64

In [6]:
fetch_df.http_status_code.value_counts()[200]


Out[6]:
18988

In [7]:
fetches_attempted = fetch_df.shape[0]
fetches_succeeded = fetch_df.http_status_code.value_counts()[200]
fetches_failed = fetches_attempted - fetches_succeeded
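Indexing value_counts()[200] raises a KeyError on a crawl that never returned a 200, so a boolean sum is safer. A sketch, equivalent on this data:

# Sketch: count successes without assuming a 200 response ever occurred.
fetches_succeeded = int((fetch_df.http_status_code == 200).sum())
fetches_failed = fetches_attempted - fetches_succeeded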

In [10]:
print(template)


Name: Saket Choudhary
USC ID: 2170058637
News site crawled: www.yahoo.com/news

Fetch Statistics
================
# fetches attempted: 18988
# fetches succeeded: 18988
# fetches failed or aborted: 0

In [19]:
total_urls_extracted = outgoingurls.shape[0]
unique_urls_extracted = len(outgoingurls.url.unique())
unique_urls_within_site = 0
unique_urls_outside_site = 0

# A URL counts as within the news site if it lives under news.yahoo.com
# or under www.yahoo.com/news; everything else is outside
for url in outgoingurls.url.unique():
    if ('https://news.yahoo.com/' in url
            or 'https://www.yahoo.com/news' in url):
        unique_urls_within_site += 1
    else:
        unique_urls_outside_site += 1
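The loop can also be vectorized with pandas string methods. A sketch that applies the same substring test (regex=False keeps the patterns literal):

# Sketch: vectorized within/outside-site split over the unique URLs.
unique_urls = pd.Series(outgoingurls.url.unique())
within = (unique_urls.str.contains('https://news.yahoo.com/', regex=False) |
          unique_urls.str.contains('https://www.yahoo.com/news', regex=False))
unique_urls_within_site = int(within.sum())
unique_urls_outside_site = int((~within).sum())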

In [22]:
print(template)


Name: Saket Choudhary
USC ID: 2170058637
News site crawled: www.yahoo.com/news

Fetch Statistics
================
# fetches attempted: 18988
# fetches succeeded: 18988
# fetches failed or aborted: 0
Outgoing URLs:
==============
Total URLs extracted: 1151638
# unique URLs extracted: 142207
# unique URLs within News Site: 23922
# unique URLs outside News Site: 118285

In [23]:
# Every fetch in this crawl returned 200, so the remaining counters are
# zero by construction (a version that does not assume this follows below)
status_200 = fetch_df.http_status_code.value_counts()[200]
status_301 = 0
status_401 = 0
status_403 = 0
status_404 = 0
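All five counts can be pulled from value_counts in one step, with missing codes defaulting to zero. A sketch:

# Sketch: look up every status code of interest at once.
codes = [200, 301, 401, 403, 404]
code_counts = fetch_df.http_status_code.value_counts().reindex(codes, fill_value=0)
status_200, status_301, status_401, status_403, status_404 = code_counts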

In [32]:
# 1KB = 1024 bytes; pd.cut bins are right-closed, so each edge is the largest
# byte count still inside the smaller bucket. The fourth edge is 1048575
# (one byte under 1MB) so that a file of exactly 1MB lands in '>= 1MB'.
visits_df['size_category'] = pd.cut(visits_df['size'].values,
                                    bins=[0, 1023, 10239, 102399, 1048575, visits_df['size'].max()],
                                    labels=['< 1KB', '1KB ~ <10KB', '10KB ~ <100KB', '100KB ~ <1MB', '>= 1MB'])
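A quick boundary check on those edges (a sketch): each probe value should land in the bucket its label promises.

# Sketch: probe the bin edges with boundary byte counts.
probe = pd.Series([1023, 1024, 10240, 102400, 1048575, 1048576])
pd.cut(probe, bins=[0, 1023, 10239, 102399, 1048575, probe.max()],
       labels=['< 1KB', '1KB ~ <10KB', '10KB ~ <100KB', '100KB ~ <1MB', '>= 1MB'])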

In [37]:
visits_df_size_category_vc = visits_df.size_category.value_counts()
visits_df_size_category_vc


Out[37]:
100KB ~ <1MB     15410
10KB ~ <100KB     3576
>= 1MB               2
1KB ~ <10KB          0
< 1KB                0
Name: size_category, dtype: int64

In [38]:
size_1kb = visits_df_size_category_vc['< 1KB']
size_10kb = visits_df_size_category_vc['1KB ~ <10KB']
size_100kb = visits_df_size_category_vc['10KB ~ <100KB']
size_1mb = visits_df_size_category_vc['100KB ~ <1MB']
size_higher = visits_df_size_category_vc['>= 1MB']

In [40]:
visits_df.content_type.value_counts()


Out[40]:
text/html    18988
Name: content_type, dtype: int64

In [41]:
# Only text/html pages were fetched in this crawl, so the image and PDF
# counters are zero by construction
type_html = visits_df.content_type.value_counts()['text/html']
type_gif = 0
type_jpeg = 0
type_png = 0
type_pdf = 0

In [56]:
# Assemble the crawl report section by section. dedent() is a no-op on
# these unindented literals but keeps the cell safe if they are ever indented.
template = dedent("""Name: Saket Choudhary
USC ID: 2170058637
News site crawled: www.yahoo.com/news

Fetch Statistics
================
# fetches attempted: {}
# fetches succeeded: {}
# fetches failed or aborted: {}""".format(fetches_attempted, fetches_succeeded, fetches_failed))

template += dedent("""\n\n
Outgoing URLs:
==============
Total URLs extracted: {}
# unique URLs extracted: {}
# unique URLs within News Site: {}
# unique URLs outside News Site: {}""".format(total_urls_extracted, unique_urls_extracted, unique_urls_within_site, unique_urls_outside_site))

template += dedent("""\n\n
Status Codes:
=============
200 OK: {}
301 Moved Permanently: {}
401 Unauthorized: {}
403 Forbidden: {}
404 Not Found: {}""".format(status_200, status_301, status_401, status_403, status_404))

template += dedent("""\n\n
File Sizes:
===========
< 1KB: {}
1KB ~ <10KB: {}
10KB ~ <100KB: {}
100KB ~ <1MB: {}
>= 1MB: {}""".format(size_1kb, size_10kb, size_100kb, size_1mb, size_higher))

template += dedent("""\n\n
Content Types:
==============
text/html: {}
image/gif: {}
image/jpeg: {}
image/png: {}
application/pdf: {}""".format(type_html, type_gif, type_jpeg, type_png, type_pdf))

In [57]:
# Write the assembled report (plain text, despite the .csv extension)
with open('../attempt3_19k/CrawlReport_yahoo.csv', 'w') as fh:
    fh.write(template)
