In [77]:
from IPython.display import Javascript
# Inject the notebook theme script into the page. The JavaScript picks the
# local copy (<origin>/files/theme/custom.js) when the page is served from
# localhost / 127.0.0.1 and load_remote_theme is false; otherwise it uses
# custom.js under theme_url. The chosen URL is then fetched with $.getScript.
# NOTE(review): this executes remotely hosted JavaScript inside the notebook page.
Javascript("""window.load_remote_theme = false; var theme_url = "https://drostehk.github.io/ipynb-theme/"; var asset_url = 'https://raw.githubusercontent.com/tijptjik/DS_assets/master/'; window.load_local_theme = function(){ var hostname = document.location.hostname; return ((hostname == "localhost" || hostname == '127.0.0.1') && !load_remote_theme)}; var url = load_local_theme() ? document.location.origin + "/files/theme/custom.js" : theme_url + 'custom.js'
$.getScript(url)""")
Out[77]:
In [78]:
import sys
# Python 2 only: force the interpreter-wide default string encoding to UTF-8.
# NOTE(review): the `reload` builtin and `sys.setdefaultencoding` do not exist
# in Python 3, and changing the default encoding globally is widely discouraged
# because it alters implicit str/unicode conversions for every loaded library.
reload(sys)
sys.setdefaultencoding("utf-8")
In [79]:
import string
import requests
from StringIO import StringIO as StrIO
In [80]:
# Address of the dump file fetched with requests.get; the .gz suffix indicates
# a gzip-compressed text file.
DATA_URL = 'https://kickass.to/hourlydump.txt.gz'
In [81]:
# Issue the GET request. With stream=True the body is not downloaded until it
# is accessed.
# NOTE(review): no timeout is passed, so a stalled server can block this cell.
r = requests.get(DATA_URL, stream=True)
Now we have a Response object called `r`. We can get all the information we need from this object.
In [82]:
r.status_code
Out[82]:
In [83]:
r.headers
Out[83]:
In [84]:
# Peek at the first 1000 characters of the body decoded as text.
# NOTE(review): the URL ends in .gz, so this is presumably compressed binary
# data rather than readable text; accessing r.text also downloads the whole
# streamed body.
r.text[:1000]
Out[84]:
In [85]:
# Request the dump a second time to obtain a fresh Response.
# NOTE(review): presumably needed because reading r.text consumed the first
# response's stream, leaving nothing for r.raw to return — confirm.
r = requests.get(DATA_URL, stream=True)
In [86]:
# Read the entire body from the underlying raw stream. Despite the name this is
# one blob, not a list of lines; it is handed to gzip.GzipFile afterwards, so
# it is expected to still be gzip-compressed.
raw_lines = r.raw.read()
In [87]:
# Wrap the downloaded blob in an in-memory file object so it can be passed to
# an API that expects a file.
# NOTE(review): StrIO is the Python 2 StringIO class; on Python 3 binary data
# would need io.BytesIO instead.
fileobj = StrIO(raw_lines)
In [88]:
import gzip
In [89]:
# Open the in-memory blob as a gzip stream. At this point `lines` is a GzipFile
# object; the name is rebound to a plain list when list(lines) is called.
lines = gzip.GzipFile(fileobj=fileobj)
In [90]:
lines
Out[90]:
In [91]:
# Iterate the gzip stream to the end, collecting every decompressed line into
# a list (rebinds `lines` from the GzipFile object to that list).
lines = list(lines)
In [ ]:
# Decode each raw line as UTF-8 and trim surrounding whitespace (which removes
# the trailing newline).
lines = [
    raw_line.decode('utf-8').strip()
    for raw_line in lines
]
In [ ]:
# Preview only the first few records: printing every line of the dump floods
# the notebook output and bloats the saved file.
for line in lines[:10]:
    print(line)
In [ ]:
# Split every record on the '|' delimiter into a list of field strings.
clean_lines = [record.rstrip().split('|') for record in lines]
In [ ]:
clean_lines[:10]
In [ ]:
import pandas as pd
In [ ]:
# One row per record and one column per '|'-separated field; the columns carry
# default integer labels until names are assigned.
df = pd.DataFrame(clean_lines)
In [ ]:
df.info()
In [ ]:
# Human-readable names for the 11 positional columns. The assignment raises if
# the frame does not have exactly this many columns.
columns = [
    'info_hash',
    'name',
    'category',
    'info_url',
    'download_url',
    'size',
    'category_id',
    'files_count',
    'seeders',
    'leechers',
    'upload_date',
]
df.columns = columns
In [ ]:
df.head()
In [ ]:
# Number of rows per category value, most frequent first.
category_count = df['category'].value_counts()
In [ ]:
category_count
In [ ]:
import seaborn as sns
In [ ]:
# Bar chart over the 'category' column; the trailing ';' suppresses the repr.
# NOTE(review): only an x variable is passed (no y). What barplot draws in that
# case depends on the seaborn version — confirm it still shows per-category
# counts in the environment used.
sns.barplot('category', data=df);
In [ ]:
# Drop the rows whose category is 'XXX'. This rebinds `df`, so the unfiltered
# frame is no longer reachable under this name and earlier displays are stale.
df = df[df.category != 'XXX']
In [ ]:
# `df` keeps the default integer index, so this selects the 'name' column for
# index labels 1 through 20 (inclusive). `.loc` states the label-based lookup
# explicitly instead of relying on the ambiguous, deprecated `.ix` indexer.
df.loc[1:20, 'name']
In [ ]:
# Boolean Series indexed by name value: True where that name occurs more than
# once in df.
dfs = df['name'].value_counts() > 1
In [ ]:
dfs.head()
In [ ]:
# Order the flags by their index (the name strings), then show positions 7..16.
# The index holds strings, so an integer slice is positional; `.iloc` states
# that explicitly instead of relying on the deprecated `.ix` fallback.
dfs = dfs.sort_index()
dfs.iloc[7:17]
In [ ]:
# One row per distinct name, ordered by name.
# NOTE(review): DataFrame.sort is the legacy spelling; newer pandas releases
# provide sort_values instead — confirm the pandas version in use.
dfx = df.drop_duplicates(subset=['name']).sort('name')
In [ ]:
len(dfx)
In [ ]:
# Keep the rows of dfx that are flagged True in dfs (names occurring more than
# once).
# NOTE(review): dfs.values is a bare boolean array, so the mask is applied by
# position. This assumes dfx (sorted by 'name') and dfs (sorted by its name
# index) list the names in exactly the same order — confirm.
dfx[dfs.values]
In [ ]:
len(dfx[dfs.values])
In [ ]:
# Store the rows of dfx selected by the positional boolean mask dfs.values.
# NOTE(review): relies on dfx and dfs having the same length and row order.
subset = dfx[dfs.values]
In [ ]:
# Bar chart over the 'category' column of the masked subset.
# NOTE(review): only an x variable is passed (no y); the result depends on the
# seaborn version — confirm it shows per-category counts.
sns.barplot('category',data=subset)
In [ ]:
df.name
In [ ]:
# Rows whose name contains "Principles" (case-sensitive; str.contains treats
# the pattern as a regular expression by default).
df[df['name'].str.contains("Principles")]
In [ ]:
# Convert every name with the Python 2 `unicode` builtin. The converted Series
# is only displayed; it is not stored back on df.
df['name'].apply(lambda x: unicode(x))
In [ ]:
# Lower-case each name and split it on whitespace into a list of tokens. The
# result is only displayed, not stored.
df['name'].str.lower().str.split()
In [ ]:
from collections import Counter
In [ ]:
# Count how often each space-separated token occurs across all names.
# Splitting on a single literal space keeps empty-string tokens wherever two
# spaces are adjacent.
joined_names = " ".join(df['name'].values.tolist())
token_counter = Counter(joined_names.split(" "))
result = token_counter.items()
In [ ]:
# Sort the (token, count) pairs in place; tuples compare by token first.
# NOTE(review): relies on Python 2, where dict.items() returns a list. On
# Python 3 it returns a view object without a sort() method.
result.sort()
In [ ]:
result
In [ ]:
# Two-column frame built from the (token, count) pairs: one row per distinct
# token.
word_count = pd.DataFrame(result, columns=['token','count'])
In [ ]:
word_count
In [ ]:
# Show the tokens ordered from most to least frequent. The sorted frame is only
# displayed; word_count itself is not reassigned.
# NOTE(review): DataFrame.sort is the legacy spelling of sort_values.
word_count.sort('count',ascending=False)
In [ ]:
# Boolean mask: True for tokens longer than 8 characters.
select = word_count['token'].str.len() > 8
In [ ]:
word_count[select]
In [ ]:
# Long tokens ordered by descending count, showing positions 1..109.
# NOTE(review): the slice starts at 1, so the single most frequent long token
# is skipped — confirm this is intended rather than [:110].
word_count[select].sort('count',ascending=False)[1:110]
In [ ]:
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit.

    An empty input yields False.
    """
    for char in inputString:
        if char.isdigit():
            return True
    return False
In [ ]:
import numpy as np
# Add a boolean column flagging each token that contains at least one digit.
word_count['has_digits'] = word_count['token'].apply(hasNumbers)
In [ ]:
word_count['has_digits'].value_counts()
In [ ]:
# Rows whose token contains no digits. `~` is the element-wise logical NOT that
# pandas documents for boolean masks; unary minus is arithmetic negation, which
# NumPy refuses for boolean arrays.
word_count[~word_count['has_digits']]
In [ ]:
# Characters treated as punctuation. The backslash is written as `\\` because
# `\,` is not a valid escape sequence; the resulting string is unchanged.
punctuations = '''!()-[]{};:'"\\,<>./?@#$%^&*_~'''
In [ ]:
def hasPunctuation(inputString, chars=None):
    """Return True if ``inputString`` contains at least one punctuation character.

    Parameters:
        inputString: the text to inspect.
        chars: optional collection of characters to treat as punctuation.
            When omitted, the module-level ``punctuations`` string is used.

    Returns:
        True if any character of ``inputString`` is in the punctuation set,
        otherwise False. An empty input yields False.
    """
    if chars is None:
        chars = punctuations
    # The original tested an undefined name (`chat`) instead of the loop
    # variable, which raised NameError for any non-empty input.
    return any(char in chars for char in inputString)