In [1]:
%pylab inline
import pandas as pd
from textwrap import dedent


Populating the interactive namespace from numpy and matplotlib

In [2]:
!ls ../attempt3_19k/


fetch_yahoo.csv		outgoingurls_yahoo_new.csv  urls_yahoo_new.csv
outgoingurls_yahoo.csv	urls_yahoo.csv		    visit_yahoo.csv

Process URLs


In [15]:
urls_to_write = 'url,status\n'

with open('../attempt3_19k/urls_yahoo.csv') as fh:
    for index, line in enumerate(fh):
        if index == 0:
            # skip the header row
            continue
        splitted = line.rstrip('\n').split(',')
        if len(splitted) < 2:
            print(line)
            raise RuntimeError('expected at least two comma-separated fields')
        # URLs may themselves contain commas, so everything except the
        # last field (the crawl status) belongs to the URL; rejoin it
        # with '-' so the URL stays a single CSV field
        url = '-'.join(splitted[:-1])
        # Re-derive the status: a URL is within the news site if it
        # lives under news.yahoo.com or www.yahoo.com/news
        if ('https://news.yahoo.com' in url
                or 'https://www.yahoo.com/news' in url):
            status = 'OK'
        else:
            status = 'N_OK'
        urls_to_write += '{},{}\n'.format(url, status)

with open('../attempt3_19k/urls_yahoo_new.csv', 'w') as fh:
    fh.write(urls_to_write)
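The field-count cases above can be avoided entirely: only the last comma separates the URL from its status, so str.rsplit with maxsplit=1 does the split in one step. A minimal sketch, not what the notebook ran:

# Sketch: parse "url,status" lines whose URL may itself contain commas.
with open('../attempt3_19k/urls_yahoo.csv') as fh:
    next(fh)  # skip the header row
    for line in fh:
        url, status = line.rstrip('\n').rsplit(',', 1)
        url = url.replace(',', '-')  # keep the URL a single CSV field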

In [16]:
urls_df = pd.read_csv('../attempt3_19k/urls_yahoo_new.csv')
urls_df.status.value_counts()


Out[16]:
N_OK    710048
OK           3
Name: status, dtype: int64

In [ ]:
# Process outgoing URLs: replace embedded commas so each URL stays a
# single CSV field
urls_to_write = 'url\n'
with open('../attempt3_19k/outgoingurls_yahoo.csv') as fh:
    for index, line in enumerate(fh):
        if index == 0:
            # skip the header row
            continue
        # strip the newline here so the '{}\n' below does not write a
        # blank line after every URL
        line = line.rstrip('\n').replace(',', '-')
        urls_to_write += '{}\n'.format(line)
with open('../attempt3_19k/outgoingurls_yahoo_new.csv', 'w') as fh:
    fh.write(urls_to_write)
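The same cleanup can be written in pandas. A sketch, assuming the URLs contain neither tabs nor quote characters, so each raw line parses as a single field:

# Sketch: read each line as one column, fix commas, write back out.
outgoing = pd.read_csv('../attempt3_19k/outgoingurls_yahoo.csv',
                       sep='\t', skiprows=1, names=['url'])
outgoing['url'] = outgoing['url'].str.replace(',', '-')
outgoing.to_csv('../attempt3_19k/outgoingurls_yahoo_new.csv', index=False)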

In [4]:
fetch_df = pd.read_csv('../attempt3_19k/fetch_yahoo.csv')
outgoingurls = pd.read_csv('../attempt3_19k/outgoingurls_yahoo_new.csv', header=0)
urls_df = pd.read_csv('../attempt3_19k/urls_yahoo_new.csv')
visits_df = pd.read_csv('../attempt3_19k/visit_yahoo.csv')

In [5]:
fetch_df.http_status_code.value_counts()


Out[5]:
200    18988
Name: http_status_code, dtype: int64

In [6]:
fetch_df.http_status_code.value_counts()[200]


Out[6]:
18988

In [7]:
fetches_attempted = fetch_df.shape[0]
fetches_succeeded = fetch_df.http_status_code.value_counts()[200]
fetches_failed = fetches_attempted - fetches_succeeded
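Indexing value_counts()[200] raises a KeyError on a crawl that never returned a 200, so a boolean sum is safer. A sketch, equivalent on this data:

# Sketch: count successes without assuming a 200 response ever occurred.
fetches_succeeded = int((fetch_df.http_status_code == 200).sum())
fetches_failed = fetches_attempted - fetches_succeeded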

In [10]:
print(template)


Name: Saket Choudhary
USC ID: 2170058637
News site crawled: www.yahoo.com/news

Fetch Statistics
================
# fetches attempted: 18988
# fetches succeeded: 18988
# fetches failed or aborted: 0

In [19]:
total_urls_extracted = outgoingurls.shape[0]
unique_urls_extracted = len(outgoingurls.url.unique())
unique_urls_within_site = 0
unique_urls_outside_site = 0

# A URL counts as within the news site if it lives under news.yahoo.com
# or under www.yahoo.com/news; everything else is outside
for url in outgoingurls.url.unique():
    if ('https://news.yahoo.com/' in url
            or 'https://www.yahoo.com/news' in url):
        unique_urls_within_site += 1
    else:
        unique_urls_outside_site += 1
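The loop can also be vectorized with pandas string methods. A sketch that applies the same substring test (regex=False keeps the patterns literal):

# Sketch: vectorized within/outside-site split over the unique URLs.
unique_urls = pd.Series(outgoingurls.url.unique())
within = (unique_urls.str.contains('https://news.yahoo.com/', regex=False) |
          unique_urls.str.contains('https://www.yahoo.com/news', regex=False))
unique_urls_within_site = int(within.sum())
unique_urls_outside_site = int((~within).sum())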

In [22]:
print(template)


Name: Saket Choudhary
USC ID: 2170058637
News site crawled: www.yahoo.com/news

Fetch Statistics
================
# fetches attempted: 18988
# fetches succeeded: 18988
# fetches failed or aborted: 0
Outgoing URLs:
==============
Total URLs extracted: 1151638
# unique URLs extracted: 142207
# unique URLs within News Site: 23922
# unique URLs outside News Site: 118285

In [23]:
# Every fetch in this crawl returned 200, so the remaining counters are
# zero by construction (a version that does not assume this follows below)
status_200 = fetch_df.http_status_code.value_counts()[200]
status_301 = 0
status_401 = 0
status_403 = 0
status_404 = 0
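All five counts can be pulled from value_counts in one step, with missing codes defaulting to zero. A sketch:

# Sketch: look up every status code of interest at once.
codes = [200, 301, 401, 403, 404]
code_counts = fetch_df.http_status_code.value_counts().reindex(codes, fill_value=0)
status_200, status_301, status_401, status_403, status_404 = code_counts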

In [32]:
# 1KB = 1024 bytes; pd.cut bins are right-closed, so each edge is the largest
# byte count still inside the smaller bucket. The fourth edge is 1048575
# (one byte under 1MB) so that a file of exactly 1MB lands in '>= 1MB'.
visits_df['size_category'] = pd.cut(visits_df['size'].values,
                                    bins=[0, 1023, 10239, 102399, 1048575, visits_df['size'].max()],
                                    labels=['< 1KB', '1KB ~ <10KB', '10KB ~ <100KB', '100KB ~ <1MB', '>= 1MB'])
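A quick boundary check on those edges (a sketch): each probe value should land in the bucket its label promises.

# Sketch: probe the bin edges with boundary byte counts.
probe = pd.Series([1023, 1024, 10240, 102400, 1048575, 1048576])
pd.cut(probe, bins=[0, 1023, 10239, 102399, 1048575, probe.max()],
       labels=['< 1KB', '1KB ~ <10KB', '10KB ~ <100KB', '100KB ~ <1MB', '>= 1MB'])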

In [37]:
visits_df_size_category_vc = visits_df.size_category.value_counts()
visits_df_size_category_vc


Out[37]:
100KB ~ <1MB     15410
10KB ~ <100KB     3576
>= 1MB               2
1KB ~ <10KB          0
< 1KB                0
Name: size_category, dtype: int64

In [38]:
size_1kb = visits_df_size_category_vc['< 1KB']
size_10kb = visits_df_size_category_vc['1KB ~ <10KB']
size_100kb = visits_df_size_category_vc['10KB ~ <100KB']
size_1mb = visits_df_size_category_vc['100KB ~ <1MB']
size_higher = visits_df_size_category_vc['>= 1MB']

In [40]:
visits_df.content_type.value_counts()


Out[40]:
text/html    18988
Name: content_type, dtype: int64

In [41]:
# Only text/html pages were fetched in this crawl, so the image and PDF
# counters are zero by construction
type_html = visits_df.content_type.value_counts()['text/html']
type_gif = 0
type_jpeg = 0
type_png = 0
type_pdf = 0

In [56]:
# Assemble the crawl report section by section. dedent() is a no-op on
# these unindented literals but keeps the cell safe if they are ever indented.
template = dedent("""Name: Saket Choudhary
USC ID: 2170058637
News site crawled: www.yahoo.com/news

Fetch Statistics
================
# fetches attempted: {}
# fetches succeeded: {}
# fetches failed or aborted: {}""".format(fetches_attempted, fetches_succeeded, fetches_failed))

template += dedent("""\n\n
Outgoing URLs:
==============
Total URLs extracted: {}
# unique URLs extracted: {}
# unique URLs within News Site: {}
# unique URLs outside News Site: {}""".format(total_urls_extracted, unique_urls_extracted, unique_urls_within_site, unique_urls_outside_site))

template += dedent("""\n\n
Status Codes:
=============
200 OK: {}
301 Moved Permanently: {}
401 Unauthorized: {}
403 Forbidden: {}
404 Not Found: {}""".format(status_200, status_301, status_401, status_403, status_404))

template += dedent("""\n\n
File Sizes:
===========
< 1KB: {}
1KB ~ <10KB: {}
10KB ~ <100KB: {}
100KB ~ <1MB: {}
>= 1MB: {}""".format(size_1kb, size_10kb, size_100kb, size_1mb, size_higher))

template += dedent("""\n\n
Content Types:
==============
text/html: {}
image/gif: {}
image/jpeg: {}
image/png: {}
application/pdf: {}""".format(type_html, type_gif, type_jpeg, type_png, type_pdf))

In [57]:
# Write the assembled report (plain text, despite the .csv extension)
with open('../attempt3_19k/CrawlReport_yahoo.csv', 'w') as fh:
    fh.write(template)
