In [1]:
%pylab inline
import pandas as pd
from textwrap import dedent
In [2]:
# Peek at the crawl output directory to confirm the expected CSV files exist.
!ls ../attempt3_19k/
In [15]:
urls_to_write = 'url,status\n'
with open('../attempt3_19k/urls_yahoo.csv') as fh:
for index, line in enumerate(fh):
if index == 0:
continue
splitted = line.split(',')
if len(splitted) == 2:
url = splitted[0]
status = splitted[1]
elif len(splitted) >= 3:
url = '-'.join(splitted[0:len(splitted)-1])
status = splitted[-1]
else:
print(line)
raise RuntimeError
url = url.replace(",", "-")
if 'https://news.yahoo.com' in url:
status = 'OK'
elif 'https://www.yahoo.com/news' in url:
status = 'OK'
else:
status = 'N_OK'
urls_to_write += '{},{}\n'.format(url,status)
with open('../attempt3_19k/urls_yahoo_new.csv', 'w') as fh:
fh.write(urls_to_write)
In [16]:
urls_df = pd.read_csv('../attempt3_19k/urls_yahoo_new.csv')
urls_df.status.value_counts()
Out[16]:
In [ ]:
# Process outgoing urls
urls_to_write = 'url\n'
with open('../attempt3_19k/outgoingurls_yahoo.csv') as fh:
for index, line in enumerate(fh):
if index == 0:
continue
line = line.replace(',', '-')
urls_to_write += '{}\n'.format(line)
with open('../attempt3_19k/outgoingurls_yahoo_new.csv', 'w') as fh:
fh.write(urls_to_write)
In [4]:
# Load all four crawl artefacts:
#   fetch_yahoo.csv            - one row per fetch attempt (has http_status_code)
#   outgoingurls_yahoo_new.csv - cleaned outgoing-link list written above
#   urls_yahoo_new.csv         - cleaned url,status file written above
#   visit_yahoo.csv            - per-visit metadata (has size, content_type)
fetch_df = pd.read_csv('../attempt3_19k/fetch_yahoo.csv')
outgoingurls = pd.read_csv('../attempt3_19k/outgoingurls_yahoo_new.csv', header=0)
urls_df = pd.read_csv('../attempt3_19k/urls_yahoo_new.csv')
visits_df = pd.read_csv('../attempt3_19k/visit_yahoo.csv')
In [5]:
# Distribution of HTTP status codes across all fetch attempts.
fetch_df.http_status_code.value_counts()
Out[5]:
In [6]:
# Number of successful (HTTP 200) fetches.
fetch_df.http_status_code.value_counts()[200]
Out[6]:
In [7]:
fetches_attempted = fetch_df.shape[0]
fetches_succeeded = fetch_df.http_status_code.value_counts()[200]
fetches_failed = fetches_attempted - fetches_succeeded
In [50]:
In [10]:
print(template)
In [ ]:
In [19]:
total_urls_extracted = outgoingurls.shape[0]
unique_urls_extracted = len(outgoingurls.url.unique())
unique_urls_within_site = 0
unique_urls_outside_site = 0
for url in outgoingurls.url.unique():
if 'https://news.yahoo.com/' in url:
unique_urls_within_site += 1
elif 'https://www.yahoo.com/news' in url:
unique_urls_within_site += 1
else:
unique_urls_outside_site += 1
In [51]:
In [22]:
print(template)
In [23]:
status_200 = fetch_df.http_status_code.value_counts()[200]
status_301 = 0
status_401 = 0
status_403 = 0
status_404 = 0
In [52]:
In [32]:
visits_df['size_category'] = pd.cut(visits_df['size'].values, bins=[0,1023, 10239, 102399, 1048576, visits_df['size'].max()],
labels=['< 1KB', '1KB ~ <10KB', '10KB ~ <100KB', '100KB ~ <1MB', '>= 1MB'])
In [37]:
visits_df_size_category_vc = visits_df.size_category.value_counts()
visits_df_size_category_vc
Out[37]:
In [38]:
size_1kb = visits_df_size_category_vc['< 1KB']
size_10kb = visits_df_size_category_vc['1KB ~ <10KB']
size_100kb = visits_df_size_category_vc['10KB ~ <100KB']
size_1mb = visits_df_size_category_vc['100KB ~ <1MB']
size_higher = visits_df_size_category_vc['>= 1MB']
In [53]:
In [40]:
visits_df.content_type.value_counts()
Out[40]:
In [41]:
type_html = visits_df.content_type.value_counts()['text/html']
type_gif = 0
type_jpeg = 0
type_png = 0
type_pdf = 0
In [56]:
template = dedent("""Name: Saket Choudhary
USC ID: 2170058637
News site crawled: www.yahoo.com/news
Fetch Statistics
================
# fetches attempted: {}
# fetches succeeded: {}
# fetches failed or aborted: {}""".format(fetches_attempted, fetches_succeeded, fetches_failed))
template += dedent("""\n\n
Outgoing URLs:
==============
Total URLs extracted: {}
# unique URLs extracted: {}
# unique URLs within News Site: {}
# unique URLs outside News Site: {}""".format(total_urls_extracted, unique_urls_extracted, unique_urls_within_site, unique_urls_outside_site))
template += dedent("""\n\n
Status Codes:
=============
200 OK: {}
301 Moved Permanently: {}
401 Unauthorized: {}
403 Forbidden: {}
404 Not Found: {}""".format(status_200, status_301, status_401, status_403, status_404))
template += dedent("""\n\n
File Sizes:
===========
< 1KB: {}
1KB ~ <10KB: {}
10KB ~ <100KB: {}
100KB ~ <1MB: {}
>= 1MB: {}""".format(size_1kb, size_10kb, size_100kb, size_1mb, size_higher))
template += dedent("""\n\n
Content Types:
==============
text/html: {}
image/gif: {}
image/jpeg: {}
image/png: {}
application/pdf: {} """.format(type_html, type_gif, type_jpeg, type_png, type_pdf))
In [57]:
# Persist the assembled report to disk.
# NOTE(review): the content is plain text, not CSV — the .csv extension looks
# like a leftover; confirm whether the deliverable should be a .txt file.
with open('../attempt3_19k/CrawlReport_yahoo.csv', 'w') as fh:
    fh.write(template)
In [ ]: