In [13]:
# Fetch the watermark extension script from GitHub so it can be loaded with %load_ext.
# NOTE(review): %install_ext may be unavailable on newer IPython releases — confirm before re-running.
%install_ext https://raw.githubusercontent.com/rasbt/python_reference/master/ipython_magic/watermark.py


Installed watermark.py. To use it, type:
  %load_ext watermark

In [14]:
# Load the installed extension so the %watermark magic is available in this session.
%load_ext watermark

In [15]:
# Record the run's timestamp, Python/IPython versions and machine details.
%watermark


18/01/2015 00:11:26

CPython 2.7.9
IPython 2.3.1

compiler   : GCC 4.2.1 (Apple Inc. build 5577)
system     : Darwin
release    : 13.4.0
machine    : x86_64
processor  : i386
CPU cores  : 4
interpreter: 64bit

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

Initialization


In [16]:
# Load the training set (path is relative to the notebook's working directory).
data = pd.read_csv(filepath_or_buffer='training.csv')

In [17]:
# Stack three copies of the training set to get a larger benchmark input.
# pd.concat replaces the chained DataFrame.append calls (append is gone from
# current pandas); rows and their repeated index labels keep the same order.
data_ = pd.concat([data, data, data])

In [19]:
# Report how many rows the enlarged frame holds.
print("Number of lines: {}".format(len(data_.index)))


Number of lines: 750000

Benchmarks

Writing data


In [20]:
%%time
data_.to_csv('big_data.csv', index=False)


CPU times: user 17.9 s, sys: 487 ms, total: 18.4 s
Wall time: 18.5 s

In [21]:
# Show the on-disk size of the CSV file.
!ls -lh big_data.csv


-rw-r--r-- 1 djabbz staff 210M Jan 18 00:12 big_data.csv

Loading bigger data


In [8]:
%%time
data_ = pd.read_csv('big_data.csv')


CPU times: user 5.35 s, sys: 809 ms, total: 6.16 s
Wall time: 6.21 s

Sequential computing


In [9]:
# Model inputs as NumPy arrays: target labels, per-row weights, and a feature
# matrix made of every column except the identifier, weight and label.
# NOTE(review): this reads `data`, not the enlarged `data_` — confirm which frame the benchmark should use.
y, w = data['Label'].values, data['Weight'].values
X = data.drop(labels=['EventId', 'Weight', 'Label'], axis=1).values

In [ ]:
%%time
randomforests = RandomForestClassifier(n_estimators=20, n_jobs=1).fit(X, y)

Parallel computing


In [11]:
# Job count passed to the forest as n_jobs; equals the 4 CPU cores reported by %watermark.
num_jobs = 4

In [ ]:
%%time
randomforests = RandomForestClassifier(n_estimators=20, n_jobs=num_jobs).fit(X, y)