In [1]:
%matplotlib inline

Fetching the data

A simple HTTP request


In [2]:
import requests

print requests.get("http://example.com").text


<!doctype html>
<html>
<head>
    <title>Example Domain</title>

    <meta charset="utf-8" />
    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <style type="text/css">
    body {
        background-color: #f0f0f2;
        margin: 0;
        padding: 0;
        font-family: "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;
        
    }
    div {
        width: 600px;
        margin: 5em auto;
        padding: 50px;
        background-color: #fff;
        border-radius: 1em;
    }
    a:link, a:visited {
        color: #38488f;
        text-decoration: none;
    }
    @media (max-width: 700px) {
        body {
            background-color: #fff;
        }
        div {
            width: auto;
            margin: 0 auto;
            border-radius: 0;
            padding: 1em;
        }
    }
    </style>    
</head>

<body>
<div>
    <h1>Example Domain</h1>
    <p>This domain is established to be used for illustrative examples in documents. You may use this
    domain in examples without prior coordination or asking for permission.</p>
    <p><a href="http://www.iana.org/domains/example">More information...</a></p>
</div>
</body>
</html>
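
The response object also exposes the status code and headers, which are worth checking before parsing anything. A minimal sketch, using the same requests API:

In [ ]:
response = requests.get("http://example.com")
print response.status_code              # 200 means the request succeeded
print response.headers['content-type']  # e.g. 'text/html'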

Communicating with APIs


In [3]:
response = requests.get("https://www.googleapis.com/books/v1/volumes", params={"q":"machine learning"})
raw_data = response.json()
titles = [item['volumeInfo']['title'] for item in raw_data['items']]
titles


Out[3]:
[u'C4.5',
 u'Machine Learning',
 u'Machine Learning',
 u'Machine Learning',
 u'A First Course in Machine Learning',
 u'Machine Learning',
 u'Elements of Machine Learning',
 u'Introduction to Machine Learning',
 u'Pattern Recognition and Machine Learning',
 u'Machine Learning and Its Applications']
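
The same endpoint can also be paged through: maxResults and startIndex are documented parameters of the Google Books API. A sketch fetching the next batch of results:

In [ ]:
# Fetch results 10-19 instead of the first page
response = requests.get("https://www.googleapis.com/books/v1/volumes",
                        params={"q": "machine learning",
                                "startIndex": 10, "maxResults": 10})
print len(response.json()['items'])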

In [4]:
Scraping web pages
import lxml.html

page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple")
# ^ This is probably illegal. Blocket, please don't sue me!
items_data = []
for el in page.getroot().find_class("item_row"):
    links = el.find_class("item_link")
    images = el.find_class("item_image")
    prices = el.find_class("list_price")
    if links and images and prices and prices[0].text:
        items_data.append({"name": links[0].text,
                           "image": images[0].attrib['src'],
                           "price": int(prices[0].text.split(":")[0].replace(" ", ""))})
items_data


Out[4]:
[{'image': 'http://cdn.blocket.com/static/2/lithumbs/98/9864322297.jpg',
  'name': 'Macbook laddare 60w',
  'price': 250},
 {'image': 'http://cdn.blocket.com/static/2/lithumbs/43/4338840758.jpg',
  'name': u'Apple iPhone 5S 16GB - Ol\xe5st - 12 m\xe5n garanti',
  'price': 3999},
 {'image': 'http://cdn.blocket.com/static/0/lithumbs/98/9838946223.jpg',
  'name': u'Ol\xe5st iPhone 5 64 GB med n\xe4stan nytt batteri',
  'price': 3000},
 {'image': 'http://cdn.blocket.com/static/1/lithumbs/79/7906971367.jpg',
  'name': u'Apple iPhone 5C 16GB - Ol\xe5st - 12 m\xe5n garanti',
  'price': 3099},
 {'image': 'http://cdn.blocket.com/static/0/lithumbs/79/7926951568.jpg',
  'name': u'HP Z620 Workstation - 1 \xe5rs garanti',
  'price': 12494},
 {'image': 'http://cdn.blocket.com/static/0/lithumbs/97/9798755036.jpg',
  'name': 'HP ProBook 6450b - Andrasortering',
  'price': 1699},
 {'image': 'http://cdn.blocket.com/static/1/lithumbs/98/9898462036.jpg',
  'name': 'Macbook pro 13 retina, 256 gb ssd',
  'price': 12000}]
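
Since items_data is a plain list of dicts, the usual Python tools apply directly; for instance, sorting the ads by price:

In [ ]:
# The three cheapest ads
sorted(items_data, key=lambda item: item['price'])[:3]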

Reading local data


In [5]:
import pandas

df = pandas.read_csv('sample.csv')

In [6]:
# Display the DataFrame
df


Out[6]:
   Year   Make                                   Model                         Description  Price
0  1997   Ford                                    E350                       ac, abs, moon   3000
1  1999  Chevy              Venture "Extended Edition"                                 NaN   4900
2  1999  Chevy  Venture "Extended Edition, Very Large"                                 NaN   5000
3  1996   Jeep                          Grand Cherokee  MUST SELL!\nair, moon roof, loaded    NaN

In [7]:
# DataFrame's columns
df.columns


Out[7]:
Index([u'Year', u'Make', u'Model', u'Description', u'Price'], dtype='object')

In [8]:
# Values of a given column
df.Model


Out[8]:
0                                      E350
1                Venture "Extended Edition"
2    Venture "Extended Edition, Very Large"
3                            Grand Cherokee
Name: Model, dtype: object

Analyzing the DataFrame


In [9]:
# Any missing values?
df['Price']


Out[9]:
0    3000
1    4900
2    5000
3     NaN
Name: Price, dtype: float64

In [10]:
df['Description']


Out[10]:
0                         ac, abs, moon
1                                   NaN
2                                   NaN
3    MUST SELL!\nair, moon roof, loaded
Name: Description, dtype: object

In [11]:
# Fill missing descriptions with a placeholder, and missing prices
# by linear interpolation
df['Description'] = df['Description'].fillna("No description is available.")
df['Price'] = df['Price'].interpolate()

df


Out[11]:
   Year   Make                                   Model                         Description  Price
0  1997   Ford                                    E350                       ac, abs, moon   3000
1  1999  Chevy              Venture "Extended Edition"        No description is available.   4900
2  1999  Chevy  Venture "Extended Edition, Very Large"        No description is available.   5000
3  1996   Jeep                          Grand Cherokee  MUST SELL!\nair, moon roof, loaded   5000
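
Interpolation is only one strategy; when losing rows is acceptable, dropna() simply discards incomplete ones. A sketch on a freshly loaded frame:

In [ ]:
# Alternative: drop every row that contains at least one missing value
pandas.read_csv('sample.csv').dropna()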

Exploring data


In [12]:
import matplotlib.pyplot as plt

df = pandas.read_csv('sample2.csv')

df


Out[12]:
       Office  Year  Sales
0   Stockholm  2004    200
1   Stockholm  2005    250
2   Stockholm  2006    255
3   Stockholm  2007    260
4   Stockholm  2008    264
5   Stockholm  2009    274
6   Stockholm  2010    330
7   Stockholm  2011    364
8    New York  2004    432
9    New York  2005    469
10   New York  2006    480
11   New York  2007    438
12   New York  2008    330
13   New York  2009    280
14   New York  2010    299
15   New York  2011    230

In [13]:
# This table has 3 columns: Office, Year, Sales
print df.columns

# It's really easy to query data with Pandas:
print df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]

# It's also easy to do aggregations...
aggregated_sales = df.groupby('Year').sum()
print aggregated_sales


Index([u'Office', u'Year', u'Sales'], dtype='object')
      Office  Year  Sales
4  Stockholm  2008    264
5  Stockholm  2009    274
6  Stockholm  2010    330
7  Stockholm  2011    364
      Sales
Year       
2004    632
2005    719
2006    735
2007    698
2008    594
2009    554
2010    629
2011    594
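
Summing across offices hides the per-office trends. pivot() reshapes the table so that each office becomes its own column, which is handy both for inspection and for plotting:

In [ ]:
# One column per office, indexed by year
df.pivot(index='Year', columns='Office', values='Sales')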

In [14]:
# ... and generate plots
aggregated_sales.plot(kind='bar')


Out[14]:
<matplotlib.axes._subplots.AxesSubplot at 0x1089dcc10>

[Figure: bar chart of total sales per year]

Machine learning

Feature extraction


In [15]:
from sklearn import feature_extraction

Extracting features from text


In [16]:
corpus = ['All the cats really are great.',
          'I like the cats but I still prefer the dogs.',
          'Dogs are the best.',
          'I like all the trains',
          ]

tfidf = feature_extraction.text.TfidfVectorizer()

print tfidf.fit_transform(corpus).toarray()
print tfidf.get_feature_names()


[[ 0.38761905  0.38761905  0.          0.          0.38761905  0.
   0.49164562  0.          0.          0.49164562  0.          0.25656108
   0.        ]
 [ 0.          0.          0.          0.4098205   0.32310719  0.32310719
   0.          0.32310719  0.4098205   0.          0.4098205   0.42772268
   0.        ]
 [ 0.          0.4970962   0.6305035   0.          0.          0.4970962
   0.          0.          0.          0.          0.          0.32902288
   0.        ]
 [ 0.4970962   0.          0.          0.          0.          0.          0.
   0.4970962   0.          0.          0.          0.32902288  0.6305035 ]]
[u'all', u'are', u'best', u'but', u'cats', u'dogs', u'great', u'like', u'prefer', u'really', u'still', u'the', u'trains']
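
Once fitted, the same vectorizer maps unseen documents into the learned feature space (words it has never seen are simply ignored):

In [ ]:
print tfidf.transform(['I really like dogs and cats.']).toarray()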

Dict vectorizer


In [17]:
import json


data = [json.loads("""{"weight": 194.0, "sex": "female", "student": true}"""),
        {"weight": 60., "sex": 'female', "student": True},
        {"weight": 80.1, "sex": 'male', "student": False},
        {"weight": 65.3, "sex": 'male', "student": True},
        {"weight": 58.5, "sex": 'female', "student": False}]

vectorizer = feature_extraction.DictVectorizer(sparse=False)

vectors = vectorizer.fit_transform(data)
print vectors
print vectorizer.get_feature_names()


[[   1.     0.     1.   194. ]
 [   1.     0.     1.    60. ]
 [   0.     1.     0.    80.1]
 [   0.     1.     1.    65.3]
 [   1.     0.     0.    58.5]]
[u'sex=female', 'sex=male', u'student', u'weight']

In [18]:
# Any Python object already exposes its attributes as a dict:
class A:
    def __init__(self, x):
        self.x = x
        self.blabla = 'test'
        
a = A(20)
a.__dict__


Out[18]:
{'blabla': 'test', 'x': 20}
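
Which means DictVectorizer can featurize arbitrary objects through their __dict__. A sketch reusing the vectorizer from above (string attributes become one-hot categories, numbers are kept as-is):

In [ ]:
objects = [A(20), A(35), A(42)]
print vectorizer.fit_transform([o.__dict__ for o in objects])
print vectorizer.get_feature_names()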

Pre-processing

Scaling

In [19]:
from sklearn import preprocessing

data = [[10., 2345., 0., 2.],
        [3., -3490., 0.1, 1.99],
        [13., 3903., -0.2, 2.11]]

print preprocessing.normalize(data)


[[  4.26435200e-03   9.99990544e-01   0.00000000e+00   8.52870400e-04]
 [  8.59598396e-04  -9.99999468e-01   2.86532799e-05   5.70200269e-04]
 [  3.33075223e-03   9.99994306e-01  -5.12423421e-05   5.40606709e-04]]
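
Note that normalize() rescales each *row* to unit norm. When you instead want each feature (column) on a comparable scale, standardization is usually the right tool:

In [ ]:
# Column-wise standardization: zero mean and unit variance per feature
print preprocessing.scale(data)
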
Dimensionality reduction

In [20]:
from sklearn import decomposition

data = [[0.3, 0.2, 0.4,  0.32],
        [0.3, 0.5, 1.0, 0.19],
        [0.3, -0.4, -0.8, 0.22]]

pca = decomposition.PCA()
print pca.fit_transform(data)
print pca.explained_variance_ratio_


[[ -2.23442295e-01  -7.71447891e-02   8.06250485e-17]
 [ -8.94539226e-01   5.14200202e-02   8.06250485e-17]
 [  1.11798152e+00   2.57247689e-02   8.06250485e-17]]
[  9.95611223e-01   4.38877684e-03   9.24548594e-33]
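
The explained_variance_ratio_ above says the first component alone carries ~99.6% of the variance, so projecting down to a single dimension loses very little:

In [ ]:
pca = decomposition.PCA(n_components=1)
print pca.fit_transform(data)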

Machine learning models

Classification (SVM)


In [21]:
from sklearn import datasets
from sklearn import svm

In [22]:
iris = datasets.load_iris()

X = iris.data[:, :2]
y = iris.target

# Training the model
clf = svm.SVC(kernel='rbf')
clf.fit(X, y)

# Doing predictions
new_data = [[4.85, 3.1], [5.61, 3.02]]
print clf.predict(new_data)


[0 1]
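
score() gives the mean accuracy on whatever data you pass it; on the training data itself this is an optimistic estimate (see the cross-validation section below for a fairer one):

In [ ]:
print clf.score(X, y)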

Regression (linear regression)


In [23]:
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

def f(x):
    return x + np.random.random() * 3.

X = np.arange(0, 5, 0.5)
X = X.reshape((len(X), 1))
y = map(f, X)

clf = linear_model.LinearRegression()
clf.fit(X, y)


Out[23]:
LinearRegression(copy_X=True, fit_intercept=True, normalize=False)
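
The fitted parameters are exposed as attributes. Since f() adds uniform noise in [0, 3) on top of x, we'd expect a slope close to 1 and an intercept around 1.5:

In [ ]:
print clf.coef_, clf.intercept_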

In [24]:
new_X = np.arange(0.2, 5.2, 0.3)
new_X = new_X.reshape((len(new_X), 1))
new_y = clf.predict(new_X)

plt.scatter(X, y, color='g', label='Training data')

plt.plot(new_X, new_y, '.-', label='Predicted')
plt.legend()


Out[24]:
<matplotlib.legend.Legend at 0x10a38f290>

[Figure: training data scatter with the fitted regression line]

Clustering (DBSCAN)


In [25]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.4,
                            random_state=0)
X = StandardScaler().fit_transform(X)

In [26]:
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
db.labels_


Out[26]:
array([-1,  0,  2,  1,  1,  2, -1,  0,  0, -1, -1,  0,  0,  2, -1, -1,  2,
        0,  1,  0,  0,  2, -1, -1,  0, -1, -1,  1, -1,  2,  1, -1,  1, -1,
        1,  0,  1,  0,  0,  2,  2, -1,  2,  1,  0,  1,  0,  1,  2,  1,  1,
        2, -1,  2,  1, -1,  0,  0, -1,  1,  0,  0,  1,  2,  0, -1,  2,  1,
       -1,  0,  0,  1,  1,  0, -1,  2, -1,  1,  2,  2,  0,  2,  1,  0, -1,
        0,  2,  1, -1,  2,  0, -1,  1,  1,  2,  0,  2,  1,  2,  1,  2,  2,
       -1,  2,  0,  1,  0, -1,  2,  0,  1,  0,  0, -1,  1,  0,  2,  2,  0,
        1,  0, -1,  1,  0,  1,  1,  1, -1,  1,  2,  1, -1, -1,  0,  0,  2,
        1,  1, -1,  0,  1,  2,  1,  0,  0, -1,  2,  1,  1,  1,  2,  2,  0,
        0,  2, -1,  1,  0,  1,  1,  2,  1,  2,  1,  0, -1,  2,  0,  2,  1,
        2,  1,  0,  1,  2,  0,  1, -1,  2,  0,  0,  1,  1,  1, -1,  0,  1,
        0,  1,  2, -1, -1,  2,  1,  0,  0,  2, -1,  2,  0])
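
In these labels, -1 marks points that DBSCAN classified as noise; the other values are cluster ids, so the number of clusters found is:

In [ ]:
labels = db.labels_
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
print n_clusters, 'clusters,', list(labels).count(-1), 'noise points'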

In [27]:
import matplotlib.pyplot as plt
plt.scatter(X[:, 0], X[:, 1], c=db.labels_)


Out[27]:
<matplotlib.collections.PathCollection at 0x10a6bc110>

[Figure: scatter plot of the points, colored by DBSCAN cluster label]

Cross-validation


In [28]:
from sklearn import svm, cross_validation, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

model = svm.SVC()
print cross_validation.cross_val_score(model, X, y, scoring='precision')
print cross_validation.cross_val_score(model, X, y, scoring='mean_squared_error')


[ 0.98148148  0.96491228  0.98039216]
[-0.01960784 -0.03921569 -0.02083333]
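
cross_val_score uses 3 folds by default (hence the three numbers above); the cv argument controls the number of folds, and averaging gives a single summary score:

In [ ]:
scores = cross_validation.cross_val_score(model, X, y, cv=5)
print scores.mean(), scores.std()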

Thanks for following! I hope you learned a thing or two :-)

Ping me with suggestions and questions on @halflings and/or kachkach.com