In [1]:
%matplotlib inline

Fetching the data

A simple HTTP request


In [2]:
import requests

print requests.get("http://example.com").text


<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 50px;
        background-color: #fff;
        border-radius: 1em;
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        body {
            background-color: #fff;
        }
        div {
            width: auto;
            margin: 0 auto;
            border-radius: 0;
            padding: 1em;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.</p>
    <p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
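
The response object also exposes the status code and headers, which are worth checking before parsing anything. A minimal sketch, using the same requests API:

In [ ]:
response = requests.get("http://example.com")
print response.status_code              # 200 means the request succeeded
print response.headers['content-type']  # e.g. 'text/html'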

Communicating with APIs


In [3]:
response = requests.get("https://www.googleapis.com/books/v1/volumes", params={"q":"machine learning"})
raw_data = response.json()
titles = [item['volumeInfo']['title'] for item in raw_data['items']]
titles


Out[3]:
[u'C4.5',
 u'Machine Learning',
 u'Machine Learning',
 u'Machine Learning',
 u'A First Course in Machine Learning',
 u'Machine Learning',
 u'Elements of Machine Learning',
 u'Introduction to Machine Learning',
 u'Pattern Recognition and Machine Learning',
 u'Machine Learning and Its Applications']
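
The same endpoint can also be paged through: maxResults and startIndex are documented parameters of the Google Books API. A sketch fetching the next batch of results:

In [ ]:
# Fetch results 10-19 instead of the first page
response = requests.get("https://www.googleapis.com/books/v1/volumes",
                        params={"q": "machine learning",
                                "startIndex": 10, "maxResults": 10})
print len(response.json()['items'])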

In [4]:
Scraping web pages
import lxml.html

page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple")
# ^ This is probably illegal. Blocket, please don't sue me!
items_data = []
for el in page.getroot().find_class("item_row"):
    links = el.find_class("item_link")
    images = el.find_class("item_image")
    prices = el.find_class("list_price")
    if links and images and prices and prices[0].text:
        items_data.append({"name": links[0].text,
                           "image": images[0].attrib['src'],
                           "price": int(prices[0].text.split(":")[0].replace(" ", ""))})
items_data


Out[4]:
[{'image': 'http://cdn.blocket.com/static/2/lithumbs/98/9864322297.jpg',
  'name': 'Macbook laddare 60w',
  'price': 250},
 {'image': 'http://cdn.blocket.com/static/2/lithumbs/43/4338840758.jpg',
  'name': u'Apple iPhone 5S 16GB - Ol\xe5st - 12 m\xe5n garanti',
  'price': 3999},
 {'image': 'http://cdn.blocket.com/static/0/lithumbs/98/9838946223.jpg',
  'name': u'Ol\xe5st iPhone 5 64 GB med n\xe4stan nytt batteri',
  'price': 3000},
 {'image': 'http://cdn.blocket.com/static/1/lithumbs/79/7906971367.jpg',
  'name': u'Apple iPhone 5C 16GB - Ol\xe5st - 12 m\xe5n garanti',
  'price': 3099},
 {'image': 'http://cdn.blocket.com/static/0/lithumbs/79/7926951568.jpg',
  'name': u'HP Z620 Workstation - 1 \xe5rs garanti',
  'price': 12494},
 {'image': 'http://cdn.blocket.com/static/0/lithumbs/97/9798755036.jpg',
  'name': 'HP ProBook 6450b - Andrasortering',
  'price': 1699},
 {'image': 'http://cdn.blocket.com/static/1/lithumbs/98/9898462036.jpg',
  'name': 'Macbook pro 13 retina, 256 gb ssd',
  'price': 12000}]
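
Since items_data is a plain list of dicts, the usual Python tools apply directly; for instance, sorting the ads by price:

In [ ]:
# The three cheapest ads
sorted(items_data, key=lambda item: item['price'])[:3]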

Reading local data


In [5]:
import pandas

df = pandas.read_csv('sample.csv')

In [6]:
# Display the DataFrame
df


Out[6]:
   Year   Make                                   Model                         Description  Price
0  1997   Ford                                    E350                       ac, abs, moon   3000
1  1999  Chevy              Venture "Extended Edition"                                 NaN   4900
2  1999  Chevy  Venture "Extended Edition, Very Large"                                 NaN   5000
3  1996   Jeep                          Grand Cherokee  MUST SELL!\nair, moon roof, loaded    NaN

In [7]:
# DataFrame's columns
df.columns


Out[7]:
Index([u'Year', u'Make', u'Model', u'Description', u'Price'], dtype='object')

In [8]:
# Values of a given column
df.Model


Out[8]:
0                                      E350
1                Venture "Extended Edition"
2    Venture "Extended Edition, Very Large"
3                            Grand Cherokee
Name: Model, dtype: object

Analyzing the DataFrame


In [9]:
# Any missing values?
df['Price']


Out[9]:
0    3000
1    4900
2    5000
3     NaN
Name: Price, dtype: float64

In [10]:
df['Description']


Out[10]:
0                         ac, abs, moon
1                                   NaN
2                                   NaN
3    MUST SELL!\nair, moon roof, loaded
Name: Description, dtype: object

In [11]:
# Fill missing descriptions with a placeholder, and missing prices
# by linear interpolation
df['Description'] = df['Description'].fillna("No description is available.")
df['Price'] = df['Price'].interpolate()

df


Out[11]:
   Year   Make                                   Model                         Description  Price
0  1997   Ford                                    E350                       ac, abs, moon   3000
1  1999  Chevy              Venture "Extended Edition"        No description is available.   4900
2  1999  Chevy  Venture "Extended Edition, Very Large"        No description is available.   5000
3  1996   Jeep                          Grand Cherokee  MUST SELL!\nair, moon roof, loaded   5000
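
Interpolation is only one strategy; when losing rows is acceptable, dropna() simply discards incomplete ones. A sketch on a freshly loaded frame:

In [ ]:
# Alternative: drop every row that contains at least one missing value
pandas.read_csv('sample.csv').dropna()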

Exploring data


In [12]:
import matplotlib.pyplot as plt

df = pandas.read_csv('sample2.csv')

df


Out[12]:
       Office  Year  Sales
0   Stockholm  2004    200
1   Stockholm  2005    250
2   Stockholm  2006    255
3   Stockholm  2007    260
4   Stockholm  2008    264
5   Stockholm  2009    274
6   Stockholm  2010    330
7   Stockholm  2011    364
8    New York  2004    432
9    New York  2005    469
10   New York  2006    480
11   New York  2007    438
12   New York  2008    330
13   New York  2009    280
14   New York  2010    299
15   New York  2011    230

In [13]:
# This table has 3 columns: Office, Year, Sales
print df.columns

# It's really easy to query data with Pandas:
print df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]

# It's also easy to do aggregations...
aggregated_sales = df.groupby('Year').sum()
print aggregated_sales


Index([u'Office', u'Year', u'Sales'], dtype='object')
      Office  Year  Sales
4  Stockholm  2008    264
5  Stockholm  2009    274
6  Stockholm  2010    330
7  Stockholm  2011    364
      Sales
Year       
2004    632
2005    719
2006    735
2007    698
2008    594
2009    554
2010    629
2011    594
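
Summing across offices hides the per-office trends. pivot() reshapes the table so that each office becomes its own column, which is handy both for inspection and for plotting:

In [ ]:
# One column per office, indexed by year
df.pivot(index='Year', columns='Office', values='Sales')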

In [14]:
# ... and generate plots
aggregated_sales.plot(kind='bar')


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1089dcc10>

[Figure: bar chart of total sales per year]

Machine learning

Feature extraction


In [15]:
from sklearn import feature_extraction

Extracting features from text


In [16]:
corpus = ['All the cats really are great.',
          'I like the cats but I still prefer the dogs.',
          'Dogs are the best.',
          'I like all the trains',
          ]

tfidf = feature_extraction.text.TfidfVectorizer()

print tfidf.fit_transform(corpus).toarray()
print tfidf.get_feature_names()


[[ 0.38761905  0.38761905  0.          0.          0.38761905  0.
   0.49164562  0.          0.          0.49164562  0.          0.25656108
   0.        ]
 [ 0.          0.          0.          0.4098205   0.32310719  0.32310719
   0.          0.32310719  0.4098205   0.          0.4098205   0.42772268
   0.        ]
 [ 0.          0.4970962   0.6305035   0.          0.          0.4970962
   0.          0.          0.          0.          0.          0.32902288
   0.        ]
 [ 0.4970962   0.          0.          0.          0.          0.          0.
   0.4970962   0.          0.          0.          0.32902288  0.6305035 ]]
[u'all', u'are', u'best', u'but', u'cats', u'dogs', u'great', u'like', u'prefer', u'really', u'still', u'the', u'trains']
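
Once fitted, the same vectorizer maps unseen documents into the learned feature space (words it has never seen are simply ignored):

In [ ]:
print tfidf.transform(['I really like dogs and cats.']).toarray()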

Dict vectorizer


In [17]:
import json


data = [json.loads("""{"weight": 194.0, "sex": "female", "student": true}"""),
        {"weight": 60., "sex": 'female', "student": True},
        {"weight": 80.1, "sex": 'male', "student": False},
        {"weight": 65.3, "sex": 'male', "student": True},
        {"weight": 58.5, "sex": 'female', "student": False}]

vectorizer = feature_extraction.DictVectorizer(sparse=False)

vectors = vectorizer.fit_transform(data)
print vectors
print vectorizer.get_feature_names()


[[   1.     0.     1.   194. ]
 [   1.     0.     1.    60. ]
 [   0.     1.     0.    80.1]
 [   0.     1.     1.    65.3]
 [   1.     0.     0.    58.5]]
[u'sex=female', 'sex=male', u'student', u'weight']

In [18]:
# Any Python object already exposes its attributes as a dict:
class A:
    def __init__(self, x):
        self.x = x
        self.blabla = 'test'
        
a = A(20)
a.__dict__


Out[18]:
{'blabla': 'test', 'x': 20}
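
Which means DictVectorizer can featurize arbitrary objects through their __dict__. A sketch reusing the vectorizer from above (string attributes become one-hot categories, numbers are kept as-is):

In [ ]:
objects = [A(20), A(35), A(42)]
print vectorizer.fit_transform([o.__dict__ for o in objects])
print vectorizer.get_feature_names()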

Pre-processing

Scaling

In [19]:
from sklearn import preprocessing

data = [[10., 2345., 0., 2.],
        [3., -3490., 0.1, 1.99],
        [13., 3903., -0.2, 2.11]]

print preprocessing.normalize(data)


[[  4.26435200e-03   9.99990544e-01   0.00000000e+00   8.52870400e-04]
 [  8.59598396e-04  -9.99999468e-01   2.86532799e-05   5.70200269e-04]
 [  3.33075223e-03   9.99994306e-01  -5.12423421e-05   5.40606709e-04]]
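
Note that normalize() rescales each *row* to unit norm. When you instead want each feature (column) on a comparable scale, standardization is usually the right tool:

In [ ]:
# Column-wise standardization: zero mean and unit variance per feature
print preprocessing.scale(data)
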
Dimensionality reduction

In [20]:
from sklearn import decomposition

data = [[0.3, 0.2, 0.4,  0.32],
        [0.3, 0.5, 1.0, 0.19],
        [0.3, -0.4, -0.8, 0.22]]

pca = decomposition.PCA()
print pca.fit_transform(data)
print pca.explained_variance_ratio_


[[ -2.23442295e-01  -7.71447891e-02   8.06250485e-17]
 [ -8.94539226e-01   5.14200202e-02   8.06250485e-17]
 [  1.11798152e+00   2.57247689e-02   8.06250485e-17]]
[  9.95611223e-01   4.38877684e-03   9.24548594e-33]
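
The explained_variance_ratio_ above says the first component alone carries ~99.6% of the variance, so projecting down to a single dimension loses very little:

In [ ]:
pca = decomposition.PCA(n_components=1)
print pca.fit_transform(data)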

Machine learning models

Classification (SVM)


In [21]:
from sklearn import datasets
from sklearn import svm

In [22]:
iris = datasets.load_iris()

X = iris.data[:, :2]
y = iris.target

# Training the model
clf = svm.SVC(kernel='rbf')
clf.fit(X, y)

# Doing predictions
new_data = [[4.85, 3.1], [5.61, 3.02]]
print clf.predict(new_data)


[0 1]
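
score() gives the mean accuracy on whatever data you pass it; on the training data itself this is an optimistic estimate (see the cross-validation section below for a fairer one):

In [ ]:
print clf.score(X, y)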

Regression (linear regression)


In [23]:
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

def f(x):
    return x + np.random.random() * 3.

X = np.arange(0, 5, 0.5)
X = X.reshape((len(X), 1))
y = map(f, X)

clf = linear_model.LinearRegression()
clf.fit(X, y)


Out[23]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
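
The fitted parameters are exposed as attributes. Since f() adds uniform noise in [0, 3) on top of x, we'd expect a slope close to 1 and an intercept around 1.5:

In [ ]:
print clf.coef_, clf.intercept_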

In [24]:
new_X = np.arange(0.2, 5.2, 0.3)
new_X = new_X.reshape((len(new_X), 1))
new_y = clf.predict(new_X)

plt.scatter(X, y, color='g', label='Training data')

plt.plot(new_X, new_y, '.-', label='Predicted')
plt.legend()


Out[24]:
<matplotlib.legend.Legend at 0x10a38f290>

[Figure: training data scatter with the fitted regression line]

Clustering (DBSCAN)


In [25]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.4,
                            random_state=0)
X = StandardScaler().fit_transform(X)

In [26]:
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
db.labels_


Out[26]:
array([-1,  0,  2,  1,  1,  2, -1,  0,  0, -1, -1,  0,  0,  2, -1, -1,  2,
        0,  1,  0,  0,  2, -1, -1,  0, -1, -1,  1, -1,  2,  1, -1,  1, -1,
        1,  0,  1,  0,  0,  2,  2, -1,  2,  1,  0,  1,  0,  1,  2,  1,  1,
        2, -1,  2,  1, -1,  0,  0, -1,  1,  0,  0,  1,  2,  0, -1,  2,  1,
       -1,  0,  0,  1,  1,  0, -1,  2, -1,  1,  2,  2,  0,  2,  1,  0, -1,
        0,  2,  1, -1,  2,  0, -1,  1,  1,  2,  0,  2,  1,  2,  1,  2,  2,
       -1,  2,  0,  1,  0, -1,  2,  0,  1,  0,  0, -1,  1,  0,  2,  2,  0,
        1,  0, -1,  1,  0,  1,  1,  1, -1,  1,  2,  1, -1, -1,  0,  0,  2,
        1,  1, -1,  0,  1,  2,  1,  0,  0, -1,  2,  1,  1,  1,  2,  2,  0,
        0,  2, -1,  1,  0,  1,  1,  2,  1,  2,  1,  0, -1,  2,  0,  2,  1,
        2,  1,  0,  1,  2,  0,  1, -1,  2,  0,  0,  1,  1,  1, -1,  0,  1,
        0,  1,  2, -1, -1,  2,  1,  0,  0,  2, -1,  2,  0])
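
In these labels, -1 marks points that DBSCAN classified as noise; the other values are cluster ids, so the number of clusters found is:

In [ ]:
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print n_clusters, 'clusters,', list(labels).count(-1), 'noise points'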

In [27]:
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1], c=db.labels_)


Out[27]:
<matplotlib.collections.PathCollection at 0x10a6bc110>

[Figure: scatter plot of the points, colored by DBSCAN cluster label]

Cross-validation


In [28]:
from sklearn import svm, cross_validation, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

model = svm.SVC()
print cross_validation.cross_val_score(model, X, y, scoring='precision')
print cross_validation.cross_val_score(model, X, y, scoring='mean_squared_error')


[ 0.98148148  0.96491228  0.98039216]
[-0.01960784 -0.03921569 -0.02083333]
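
cross_val_score uses 3 folds by default (hence the three numbers above); the cv argument controls the number of folds, and averaging gives a single summary score:

In [ ]:
scores = cross_validation.cross_val_score(model, X, y, cv=5)
print scores.mean(), scores.std()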

Thanks for following! I hope you learned a thing or two :-)

Ping me with suggestions and questions on @halflings and/or kachkach.com