In [77]:
    
from IPython.display import Javascript
# Inject the notebook theme. The script picks custom.js from the local
# "/files/theme/" path when the page is served from localhost / 127.0.0.1 and
# load_remote_theme is false, otherwise from theme_url, then loads it with
# $.getScript. asset_url is defined here but not used inside this snippet.
Javascript("""window.load_remote_theme = false; var theme_url = "https://drostehk.github.io/ipynb-theme/"; var asset_url = 'https://raw.githubusercontent.com/tijptjik/DS_assets/master/'; window.load_local_theme = function(){ var hostname = document.location.hostname; return ((hostname == "localhost" || hostname == '127.0.0.1') && !load_remote_theme)}; var url = load_local_theme() ? document.location.origin + "/files/theme/custom.js" : theme_url + 'custom.js'
$.getScript(url)""")
    
    Out[77]:
In [78]:
    
import sys
# Python 2 only: re-import sys so that setdefaultencoding is reachable, then
# switch the interpreter-wide default string encoding to UTF-8.
# NOTE(review): this is a process-global change and neither call exists in
# Python 3 — confirm the notebook is meant to stay on Python 2.
reload(sys)
sys.setdefaultencoding("utf-8")
    
In [79]:
    
import string
import requests
from StringIO import StringIO as StrIO
    
In [80]:
    
# Location of the gzip-compressed text dump downloaded below.
DATA_URL = 'https://kickass.to/hourlydump.txt.gz'
    
In [81]:
    
# Request the dump; stream=True defers fetching the body until it is accessed.
r = requests.get(DATA_URL, stream=True)
    
Now, we have a Response object called r. We can get all the information we need from this object.
In [82]:
    
# HTTP status code of the response.
r.status_code
    
    Out[82]:
In [83]:
    
# Headers the server sent back with the response.
r.headers
    
    Out[83]:
In [84]:
    
# Preview the first 1000 characters of the body decoded as text.
# NOTE(review): DATA_URL points at a .gz file, so this is presumably compressed
# binary rendered as text; accessing r.text also reads the streamed body, which
# looks like the reason the request is issued again afterwards — confirm.
r.text[:1000]
    
    Out[84]:
In [85]:
    
# Issue the request again so that `r` is a response whose raw stream has not
# been read yet (the previous response's body was read through r.text).
r = requests.get(DATA_URL, stream=True)
    
In [86]:
    
# Read the whole raw response stream into memory in a single call.
raw_lines = r.raw.read()
    
In [87]:
    
# Wrap the downloaded payload in an in-memory file-like object
# (StrIO is the StringIO class imported earlier) so gzip can read from it.
fileobj = StrIO(raw_lines)
    
In [88]:
    
import gzip
    
In [89]:
    
# Open the in-memory payload as a gzip stream.
lines = gzip.GzipFile(fileobj=fileobj)
    
In [90]:
    
# Display the GzipFile object itself.
lines
    
    Out[90]:
In [91]:
    
# Iterate the gzip stream to the end, collecting its lines into a list.
# This rebinds `lines` from the file object to that list.
lines = list(lines)
    
In [ ]:
    
# Decode every raw line from UTF-8 and trim the surrounding whitespace.
lines = [raw.decode('utf-8').strip() for raw in lines]
    
In [ ]:
    
# Preview only the first few lines: printing every line would flood the
# notebook output with the entire dump.
for line in lines[:10]:
    print(line)
    
In [ ]:
    
# Break every record into its '|'-separated fields
# (trailing whitespace is removed before splitting).
clean_lines = [record.rstrip().split('|') for record in lines]
    
In [ ]:
    
# Peek at the first ten parsed records.
clean_lines[:10]
    
In [ ]:
    
import pandas as pd
    
In [ ]:
    
# One row per record, one positional column per '|'-separated field.
df = pd.DataFrame(clean_lines)
    
In [ ]:
    
# Print the frame's summary (columns, non-null counts, dtypes).
df.info()
    
In [ ]:
    
# Names for the positional columns of the dump, listed in field order.
columns = [
    'info_hash',
    'name',
    'category',
    'info_url',
    'download_url',
    'size',
    'category_id',
    'files_count',
    'seeders',
    'leechers',
    'upload_date',
]
df.columns = columns
    
In [ ]:
    
# First rows of the frame, now with named columns.
df.head()
    
In [ ]:
    
# Number of rows per category, most frequent category first.
category_count = df['category'].value_counts()
    
In [ ]:
    
# Display the per-category row counts.
category_count
    
In [ ]:
    
import seaborn as sns
    
In [ ]:
    
# Plot the number of rows per category. countplot counts the observations in
# each category; barplot aggregates a numeric variable, and the original call
# never supplied one.
sns.countplot(x='category', data=df);
    
In [ ]:
    
# Keep every row whose category is something other than 'XXX'.
is_xxx = df['category'] == 'XXX'
df = df[~is_xxx]
    
In [ ]:
    
# Names of the rows whose index label lies between 1 and 20 (inclusive).
# The frame has an integer index, so .ix sliced by label here; .loc does the
# same and, unlike .ix, still exists in current pandas.
df.loc[1:20, 'name']
    
In [ ]:
    
# Boolean Series indexed by name: True where a name occurs more than once in df.
dfs = df['name'].value_counts() > 1
    
In [ ]:
    
# First few name -> "occurs more than once" flags.
dfs.head()
    
In [ ]:
    
# Order the flags by name, then show entries 7 through 16 by position.
dfs = dfs.sort_index()
# The index holds names, not integers, so the integer slice is positional;
# .iloc states that explicitly (.ix has been removed from pandas).
dfs.iloc[7:17]
    
In [ ]:
    
# One row per distinct name, ordered by name. sort_values replaces
# DataFrame.sort, which has been removed from pandas.
dfx = df.drop_duplicates(subset=['name']).sort_values('name')
    
In [ ]:
    
# Number of rows left after dropping duplicate names.
len(dfx)
    
In [ ]:
    
# Rows of dfx selected by the "name occurs more than once" flags.
# NOTE(review): dfs.values is applied as a positional mask, so this relies on
# dfs (sorted by index) and dfx (sorted by 'name') having the same order and
# length — confirm.
dfx[dfs.values]
    
In [ ]:
    
# Number of rows of dfx selected by the dfs flags (mask applied by position).
len(dfx[dfs.values])
    
In [ ]:
    
# Keep the rows of dfx selected by the dfs flags for plotting.
# NOTE(review): the mask is positional; it assumes dfs and dfx are ordered
# identically — confirm.
subset = dfx[dfs.values]
    
In [ ]:
    
# Plot the number of rows per category within `subset`. countplot counts the
# observations in each category; barplot aggregates a numeric variable, and
# the original call never supplied one.
sns.countplot(x='category', data=subset);
    
In [ ]:
    
# Display the 'name' column.
df.name
    
In [ ]:
    
# Rows whose name contains "Principles" (case-sensitive match).
df[df['name'].str.contains("Principles")]
    
In [ ]:
    
# Convert every name with the Python 2 `unicode` builtin and display the
# result; nothing is assigned, so df itself is left untouched.
df['name'].apply(unicode)
    
In [ ]:
    
# Lower-case each name and split it on whitespace (displayed only, not stored).
df['name'].str.lower().str.split()
    
In [ ]:
    
from collections import Counter
    
In [ ]:
    
# Join all names with single spaces, split the text on single spaces again,
# and count how often each resulting token occurs.
name_tokens = " ".join(df['name'].values.tolist()).split(" ")
result = Counter(name_tokens).items()
    
In [ ]:
    
# Sort the (token, count) pairs in place; tuples compare element by element,
# so the pairs end up ordered by token. This needs `result` to be a list,
# which is what .items() returns on Python 2.
result.sort()
    
In [ ]:
    
# Display the sorted (token, count) pairs.
result
    
In [ ]:
    
# Two-column frame (token, count) built from the (token, count) pairs.
word_count = pd.DataFrame(result, columns=['token','count'])
    
In [ ]:
    
# Display the token / count table.
word_count
    
In [ ]:
    
# Tokens ordered from most to least frequent. sort_values replaces
# DataFrame.sort, which has been removed from pandas.
word_count.sort_values('count', ascending=False)
    
In [ ]:
    
# Boolean mask: True for tokens longer than 8 characters.
select = word_count['token'].str.len() > 8
    
In [ ]:
    
# Only the tokens longer than 8 characters.
word_count[select]
    
In [ ]:
    
# Long tokens ordered from most to least frequent, rows 1 through 109 by
# position. sort_values replaces DataFrame.sort, which has been removed from
# pandas.
# NOTE(review): the slice starts at 1, so the single most frequent long token
# is left out — confirm that is intended.
word_count[select].sort_values('count', ascending=False)[1:110]
    
In [ ]:
    
def hasNumbers(inputString):
    """Tell whether ``inputString`` contains at least one digit character.

    Returns True as soon as a character whose ``isdigit()`` is true is found,
    and False otherwise (including for an empty string).
    """
    for ch in inputString:
        if ch.isdigit():
            return True
    return False
    
In [ ]:
    
import numpy as np
# Flag every token that contains at least one digit (via hasNumbers).
word_count['has_digits'] = word_count['token'].apply(hasNumbers)
    
In [ ]:
    
# How many tokens do and do not contain a digit.
word_count['has_digits'].value_counts()
    
In [ ]:
    
# Tokens without any digit. `~` is the boolean NOT for a mask; unary minus on
# a boolean Series is rejected by current numpy/pandas.
word_count[~word_count['has_digits']]
    
In [ ]:
    
# Characters treated as punctuation by hasPunctuation. Written as a raw string
# so the backslash is kept literally: `\,` is not a valid escape sequence and
# triggers a warning in a normal string literal. The value is unchanged.
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    
In [ ]:
    
def hasPunctuation(inputString, chars=None):
    """Return True if any character of ``inputString`` is a punctuation mark.

    Parameters
    ----------
    inputString : str
        Text to scan; an empty string yields False.
    chars : str or None, optional
        Characters to treat as punctuation. When omitted, the notebook-level
        ``punctuations`` string is used.

    Returns
    -------
    bool
        True when at least one character of ``inputString`` is in ``chars``.
    """
    if chars is None:
        # Looked up at call time so a re-defined `punctuations` is picked up.
        chars = punctuations
    # The original generator tested the undefined name `chat`, which raised
    # NameError for every non-empty input.
    return any(char in chars for char in inputString)