In [77]:
from IPython.display import Javascript
# Apply the course's custom notebook theme by injecting JavaScript into the page.
# The snippet loads custom.js from the local server (/files/theme/) when the page
# is served from localhost/127.0.0.1 and load_remote_theme is false; otherwise it
# fetches custom.js from theme_url. asset_url is defined but not used by this
# snippet itself.
# NOTE(review): $.getScript executes whatever custom.js contains, so the theme
# host is trusted with code execution in the reader's browser.
Javascript("""window.load_remote_theme = false; var theme_url = "https://drostehk.github.io/ipynb-theme/"; var asset_url = 'https://raw.githubusercontent.com/tijptjik/DS_assets/master/'; window.load_local_theme = function(){ var hostname = document.location.hostname; return ((hostname == "localhost" || hostname == '127.0.0.1') && !load_remote_theme)}; var url = load_local_theme() ? document.location.origin + "/files/theme/custom.js" : theme_url + 'custom.js'
$.getScript(url)""")


Out[77]:

Intro to Python for Data Analysis

Applying what we know: Torrent Analysis


In [78]:
import sys
# Python 2-only workaround: reloading `sys` makes sys.setdefaultencoding
# callable again so the interpreter-wide default str<->unicode codec can be
# switched to UTF-8.
# NOTE(review): this changes implicit conversions for everything running in the
# kernel and has no Python 3 equivalent; the dump lines are decoded explicitly
# with .decode('utf-8') further down, so confirm this hack is still needed.
reload(sys)
sys.setdefaultencoding("utf-8")

In [79]:
import string
import requests
from StringIO import StringIO as StrIO

In [80]:
# Location of the gzip-compressed dump file that the notebook downloads and parses.
DATA_URL = 'https://kickass.to/hourlydump.txt.gz'

In [81]:
# Request the dump. stream=True is passed so the body is not downloaded eagerly
# (presumably so the raw bytes can be read from the response later — confirm).
r = requests.get(DATA_URL, stream=True)

Now, we have a Response object called r. We can get all the information we need from this object.


In [82]:
r.status_code


Out[82]:
200

In [83]:
r.headers


Out[83]:
{'content-length': '733702', 'accept-ranges': 'bytes', 'server': 'nginx', 'last-modified': 'Thu, 28 May 2015 10:00:02 GMT', 'connection': 'keep-alive', 'etag': '"5566e722-b3206"', 'date': 'Thu, 28 May 2015 10:39:03 GMT', 'content-type': 'application/octet-stream'}

In [84]:
# Peek at the first 1000 characters of the body decoded as text. The payload is
# gzip data (the output embeds the name 'hourlydump.txt'), so decoding it as
# text yields mostly replacement characters rather than readable lines.
r.text[:1000]


Out[84]:
u'\x1f\ufffd\x08\x08"\ufffdfU\x00\x03hourlydump.txt\x00\ufffdZ]s\ufffdF\ufffd}\ufffd\u89ddLU\ufffd\ufffdn\ufffd\ufffd\x01\r\ufffdvl9\x1a\ufffdc\'\ufffd\ufffd\x03DB""\ufffd\ufffd\x00\ufffddf\ufffd\ufffd}O\x03\x14I\ufffd,\x07[5\x13W$Q"N\ufffd\x0f\ufffd=\ufffd\ufffd\u06f0\ufffd\ufffdLs.\ufffd\u034c0.\ufffd<\x16:\ufffd\ufffd*\ufffd\ufffd\u0608T9\ufffd<\ufffd}S\x16]u_\ufffd\x15\ufffdm\ufffd\x11J\ufffd,\ufffd\u02fa^\ufffd\ufffd\ufffd&\u0772$\x1f\ufffd\ufffd|\x12\ufffd\ufffd\x15;"\x18W\ufffd\ufffd?_\ufffd\ufffdg\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd\ufffdU5\x07B\ufffdi\ufffde\ufffdm\ufffd\ufffd\ufffd\ufffd\u06e2{1o~\ufffd\ufffd\ufffd\x034\ufffd\x060\ufffd\t\ufffd\ufffd\ufffd\ufffd\x00\ufffd\x0f\x00\ufffdB\ufffdu\ufffd\ufffd\x01\ufffdV\ufffd\ufffdz\ufffd\u044e3\ufffdX\ufffd\u060be\ufffd^\r\ufffd@\ufffd\ufffdf^\u0317\ufffdM\u0645\x17M\ufffd\ufffd~\ufffd\x13\ufffd\ufffdb\x7fA\ufffd5\ufffd\ufffd\ufffd,\ufffd\ufffdu\ufffd\ufffd\ufffd\ufffd\\FBK&\ufffd\ufffdY\ufffd\ufffd\u061a$\ufffdY\ufffdr%t\ufffd:\ufffdz\ufffds\ufffd\ufffd&\u0452GR\ufffdH\ufffdo\ue68e\ufffd\ufffd\ufffd\ufffd6\ufffd\ufffdt\ufffdM\x19\ufffdzU\ufffdo\x07\ufffd\x08g\ufffdmI\ufffd\ufffd\ufffd\x00k_\ufffdO~{\ufffd\ufffdF\ufffduC\u06a6\ufffd\x7f\x01\x01\u037f2v\x0bP:\x1f@\ufffdz\x04\r\\-\x01:\x124\ufffd\u04ab\ufffd]\x03\ufffd\x02(\ufffdU\u05fb\x00J\x01z \ufffdO n\ufffdFO\ufffd\ufffd\ufffdz\ufffd\ufffd\ufffd\x1dH\ufffdn\ufffd2\ufffd\ufffdT\ufffd\ufffdy\ufffdDZj\ufffdx-\ufffd\ufffd\ufffd\x18\ufffdP\ufffdH\ufffdDG\ufffd\ufffd\ufffd\ufffd/\ufffd\u056a\ufffd,J\x12\ufffd\ufffd/UK|\ufffdk\ufffdbE\ufffd\ufffd}\ufffd\ufffd!\u06e2m\x11LEW\ufffd\ufffd\ufffd\ufffd\ufffdzu\ufffd\ufffd\ufffd\ufffdOB+\x00\ufffd\x11\ufffd\x16\x03\x10\ufffd\ufffd@t\x04\ufffdG \n 
\n\ufffd\x03=\ufffd\ufffd\u967a\ufffd#=L\x0be\ufffd\x14\ufffdV\ufffd\u01dd0d\ufffd\x19\ufffdl\ufffd\ufffdJE\ufffdj\ufffd\ufffd\ufffd8Q\\Gy.|\ufffd[\ufffdS\ufffdc\'x\ufffd\x7fz[\ufffd\x14\ufffd\ufffd\ufffd\u051fI\ufffd\ufffd\ufffd\ufffd$\ufffdjUm\ufffd\x1f\u025b\ufffdiv\u48b8/7!O\ubee6\ufffd\ufffd%I\ufffd\ufffdF\x10x\ufffd?\ufffdY\ufffd\ufffd\ufffdz+?\x7f\ufffd\ufffdU@\ufffd\x06dZ\x04`z5\x02\ufffd\u06c0K\ufffd\ufffdK\ufffd\ufffd\ufffdt\ufffd\u017b\ufffdf\ufffd\x07P)0\x0f\ufffd\ufffd?go\ufffdv\x0f\ufffd\ufffdI\ufffd\\\ufffd\x1e\u064b\ufffd\ufffd>\ufffdfI\ufffdfN+\x19\ufffdw\ufffdX\ufffd%\ufffdor\x1e)\ufffdm\ufffd\ufffd\ufffd\ufffd"\ufffde\ufffd\ufffd\ufffd\ufffdYv;\x12\ufffd\ufffd\x06\ufffd\ufffdy\ufffdE\ufffd\x14d\x06\ufffd\ufffd\ufffd#~Yl\ufffd%y\ufffd\ufffd\u0751\ufffdU\ufffd\ufffdn\ufffdr\ufffd\ufffdKK\ufffd\ufffd\x16\ufffd\ufffd\ufffdK\ufffd\ufffd\x08\ufffd\ufffd&\ufffd\ufffdq\rZ\ufffdk\ufffd\ufffd\x12C(\ufffd\ufffd\ufffd\x00\r9\ufffd\ufffd7\ufffd\x05hK\ufffd\ufffd\x02t\ufffd2\ufffd\x1fHU\x7fN\ufffdT\x12\ufffd!\x19(\r7\ufffd{\xc6\ufffd\ufffdG\ufffd\ufffdv\ufffd"\ufffd3\ufffd\ufffd\u03a3(\ufffd\ufffdS\ufffd\ufffd\x13\ufffd\ufffdT\ufffd(\ufffd\ufffdy\ufffd[.\ufffd\ufffd\ufffdt\ufffd;\ufffd\ufffdw\ufffd\ufffdV\ufffd\ufffd\ufffd\ufffdV;\u22ce\ufffd\ufffd\ufffd\ufffd]W]\u07ed\ufffd\x16o w[\x12\ufffdKn\u02ae%\ufffdw\ufffd\ufffdrA\ufffd\r\ufffd\x07\ufffd\n\ufffd\x13a\ua148^\ufffd[\ufffd\ubdc8]\ufffd\ufffd2\x01\ufffd\ufffd+\ufffdBQZ\x10\ufffd\ufffdEh\ufffd3\ufffd\ufffd\x0e\ufffd\u04b0\x08\x1d\x17\ufffd\ufffd\ufffd\ufffd\ufffd\x0c\ufffdC4~$\ufffd\ufffd\ufffd?\ufffdv*\x15\x07n\ufffdb\ufffd0\ufffd\ufffd\x18\ufffd\ufffdW\x1f\ufffd\ufffd"\ufffd4\ufffd\ufffd<\ufffdY*\ufffd\x18\ufffdM\xa2\u0526\ufffdp\ufffd\ufffd4U\ufffd\ufffd*I\x13\u0567\ufffd1\ufffdH>\x16\x1b0\ufffd\ufffdh\x16$\u015e\ufffd\ufffd\ufffdEq\ufffd\ufffd\ufffd@{\x08WC\x07\ufffd\x05]\x04\ufffdN\ufffd\ufffdL\u043e\ufffd7\x7f\x0c4n\ufffds\ufffds#\x19Q\x7fL'

In [85]:
# Issue the request a second time to obtain a fresh, unread response.
# NOTE(review): presumably needed because reading r.text earlier already
# consumed the streamed body, leaving nothing for r.raw.read() — confirm.
r = requests.get(DATA_URL, stream=True)

In [86]:
# Read the whole response body from the underlying raw stream into memory.
# NOTE(review): despite the name this is one byte string, not a list of lines;
# presumably still gzip-compressed, since it is gunzipped further down — confirm.
raw_lines = r.raw.read()

In [87]:
# Wrap the downloaded bytes in an in-memory file object so gzip can read from it.
fileobj = StrIO(raw_lines)

In [88]:
import gzip

In [89]:
# Open the in-memory buffer as a gzip stream; nothing is decompressed until it is read.
lines = gzip.GzipFile(fileobj=fileobj)

In [90]:
lines


Out[90]:
<gzip StringIO.StringIO instance at 0x7f42613dc7a0 0x7f42615d1d50>

In [91]:
# Materialise the gzip stream into a list with one entry per line of the dump.
# NOTE: this rebinds `lines` from the GzipFile object to a plain list.
lines = list(lines)

In [ ]:
# Turn each raw UTF-8 byte line into text and trim surrounding whitespace.
lines = [
    raw_line.decode('utf-8').strip()
    for raw_line in lines
]

In [ ]:
# Preview only the first few decoded lines; printing the entire dump floods the
# notebook with output. The limit matches the clean_lines[:10] preview below.
for line in lines[:10]:
    print(line)

In [ ]:
# Split every line into its pipe-delimited fields (trailing whitespace removed first).
clean_lines = [record.rstrip().split('|') for record in lines]

In [ ]:
clean_lines[:10]

Pandas DataFrame


In [ ]:
import pandas as pd

In [ ]:
# Build a DataFrame with one row per dump line and one column per '|'-separated field.
df = pd.DataFrame(clean_lines)

In [ ]:
df.info()

In [ ]:
# Names for the dump's fields, assigned to the DataFrame columns in order.
# NOTE(review): assumes every row split into exactly 11 fields — confirm.
columns = [
    'info_hash',
    'name',
    'category',
    'info_url',
    'download_url',
    'size',
    'category_id',
    'files_count',
    'seeders',
    'leechers',
    'upload_date',
]

df.columns = columns

In [ ]:
df.head()

In [ ]:
# Number of rows per distinct value of the 'category' column.
category_count = df['category'].value_counts()

In [ ]:
category_count

In [ ]:
import seaborn as sns

In [ ]:
# Bar chart over the 'category' column; the trailing ';' suppresses the cell's text output.
# NOTE(review): only x is supplied, so this relies on the installed seaborn
# version drawing per-category counts; later seaborn releases provide
# countplot for that purpose — confirm against the version in use.
sns.barplot('category', data=df);

Making it SFW


In [ ]:
# Keep only the rows whose category is not 'XXX'.
is_sfw = df['category'] != 'XXX'
df = df[is_sfw]

In [ ]:
# Show the names for index labels 1 through 20 (inclusive). `.loc` is the
# explicit label-based indexer; `.ix` guessed between labels and positions
# and has been removed from current pandas.
df.loc[1:20, 'name']

In [ ]:
# Boolean Series indexed by name: True where that name occurs more than once in df.
dfs = df['name'].value_counts() > 1

In [ ]:
dfs.head()

In [ ]:
# Order the duplicate flags by name, then show the entries at positions 7-16.
# The index holds names rather than integers, so the slice is positional:
# `.iloc` states that explicitly, whereas `.ix` silently fell back to it and
# has been removed from current pandas.
dfs = dfs.sort_index()
dfs.iloc[7:17]

In [ ]:
# One row per distinct name, ordered by name.
# NOTE(review): DataFrame.sort is the older pandas spelling (later releases use
# sort_values) — confirm against the pandas version in use.
dfx = df.drop_duplicates(subset=['name']).sort('name')

In [ ]:
len(dfx)

In [ ]:
dfx[dfs.values]

In [ ]:
len(dfx[dfs.values])

In [ ]:
# Rows of dfx selected by the duplicate-name flags.
# NOTE(review): dfs.values is a bare boolean array applied by position, so this
# relies on dfx (sorted by name) and dfs (sorted by index) having the same
# length and the same order — confirm, e.g. if any name is missing.
subset = dfx[dfs.values]

In [ ]:
# Category bar chart restricted to the duplicated-name subset.
# NOTE(review): as only x is supplied, this relies on the installed seaborn
# version drawing per-category counts — confirm against the version in use.
sns.barplot('category',data=subset)

String lookups


In [ ]:
df.name

In [ ]:
df[df['name'].str.contains("Principles")]

In [ ]:
# Coerce every name to a unicode object (the result is displayed, not stored).
df['name'].apply(unicode)

In [ ]:
df['name'].str.lower().str.split()

In [ ]:
from collections import Counter

In [ ]:
# Count how often each space-separated token occurs across all names.
all_names = " ".join(df['name'].values.tolist())
token_counts = Counter(all_names.split(" "))
result = token_counts.items()

In [ ]:
# Order the (token, count) pairs; tuples compare by token first, then by count.
result = sorted(result)

In [ ]:
result

In [ ]:
# Tabulate the (token, count) pairs under named columns.
word_count = pd.DataFrame(result, columns=['token','count'])

In [ ]:
word_count

In [ ]:
word_count.sort('count',ascending=False)

In [ ]:
# Boolean mask selecting tokens longer than 8 characters.
select = word_count['token'].str.len() > 8

In [ ]:
word_count[select]

In [ ]:
# Long tokens ranked by count, highest first.
# NOTE(review): the slice starts at 1, so the single most frequent long token
# is omitted and at most 109 rows are shown — confirm this is intended rather
# than [:110].
word_count[select].sort('count',ascending=False)[1:110]

In [ ]:
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False

In [ ]:
import numpy as np
# Flag each token that contains at least one digit character.
word_count['has_digits'] = word_count['token'].apply(hasNumbers)

In [ ]:
word_count['has_digits'].value_counts()

In [ ]:
# Tokens without any digit. `~` is the element-wise boolean NOT; unary `-` on a
# boolean Series is rejected by current NumPy/pandas.
word_count[~word_count['has_digits']]

In [ ]:
# Characters treated as punctuation by hasPunctuation. Written as a raw string
# so the backslash is an explicit literal character rather than the invalid
# escape sequence `\,`; the resulting string value is unchanged.
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''

In [ ]:
def hasPunctuation(inputString, chars=None):
    """Return True if ``inputString`` contains at least one punctuation character.

    inputString: the text to inspect.
    chars: optional collection of characters to look for; when omitted, the
        notebook-level ``punctuations`` string is used.
    """
    if chars is None:
        chars = punctuations
    # Test the loop variable itself; the previous code referenced the undefined
    # name `chat`, which raised NameError for any non-empty input.
    return any(char in chars for char in inputString)