Basic Scikit-Learn Exercises

See also: Scikit-Learn API Reference



In [2]:

    
# 1. Import sklearn, import pandas as pd, and pd.read_csv the CFPB CSV file into dataframe 'df'.

import sklearn
import pandas as pd

# Path is relative to the notebook's working directory.
CFPB_CSV_PATH = 'data/cfpb_complaints_with_fictitious_data.csv'
df = pd.read_csv(CFPB_CSV_PATH)



In [3]:

    
# 2. Filter your df down to 'Product', 'Consumer Claim', 'Amount Received' using [[]] notation. Which is our target?

# Keep only the label column plus the two columns later used as features.
columns_to_keep = ['Product', 'Consumer Claim', 'Amount Received']
df = df[columns_to_keep]

# Our target is "Product".
df.head()  # head() shows the first 5 rows by default









    Out[3]:






  
    
      
      Product
      Consumer Claim
      Amount Received
    
  
  
    
      0
      Credit card
      332.63
      130.22
    
    
      1
      Debt collection
      54.79
      49.14
    
    
      2
      Credit card
      215.04
      155.28
    
    
      3
      Credit reporting
      3.31
      4.59
    
    
      4
      Debt collection
      73.99
      46.02



In [6]:

    
# 3. From sklearn.model_selection import train_test_split. Make a train/test split 80/20 (we won't use it though).

# train_test_split has lived in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to the
# legacy location only when running on a pre-0.18 install.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# test_size=.2 holds out 20% of the rows; random_state pins the shuffle so the
# split is identical on every Restart & Run All.
train, test = train_test_split(df, test_size=.2, random_state=42)
print(len(train))
print(len(test))



In [7]:

    
# 4. Assign df[['Consumer Claim', 'Amount Received']] to 'X'

# Feature matrix for the classifiers: a two-column DataFrame.
feature_columns = ['Consumer Claim', 'Amount Received']
X = df[feature_columns]



In [8]:

    
# 5. Convert to raw values df['Product'].values and assign to 'y'

# .values unwraps the pandas Series into a plain NumPy array of product labels,
# which is what the exercise asks for.
y = df['Product'].values



In [9]:

    
# 6. From sklearn.preprocessing import StandardScaler. From sklearn.pipeline import Pipeline.

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline



In [11]:

    
# 7. From sklearn.neighbors import KNeighborsClassifier. Make a scaler/knn pipeline.

from sklearn.neighbors import KNeighborsClassifier

# Steps run in order: scale the features, then classify with k-nearest
# neighbours using the estimator's default settings.
knn_steps = [
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
]
pipe = Pipeline(steps=knn_steps)



In [12]:

    
# 8. Fit your pipeline with your X and y.

# fit() returns the pipeline itself, so leaving the call as the cell's last
# expression displays the fitted pipeline's configuration.
pipe.fit(X=X, y=y)









    Out[12]:





Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])



In [13]:

    
# 9. Use your newly fitted pipeline to predict classifications for [[100, 80], [5000, 4000], [350, 900]] .

# Each row follows the column order of X. Labelling the rows with X's column
# names keeps the input consistent with the DataFrame the pipeline was fitted
# on; recent scikit-learn releases warn about missing feature names when a
# bare list is passed to an estimator fitted on a DataFrame.
new_claims = pd.DataFrame(
    [[100, 80], [5000, 4000], [350, 900]],
    columns=X.columns,
)
pipe.predict(new_claims)









    Out[13]:





array(['Money transfers', 'Consumer Loan', 'Other financial service'], dtype=object)



In [16]:

    
# 10. From sklearn.model_selection import cross_val_score. Run cross val score on your pipeline.

# cross_val_score has lived in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to the
# legacy location only when running on a pre-0.18 install.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# cv=3 is pinned explicitly: it was the implicit default when this notebook was
# written, but newer scikit-learn releases default to 5 folds.
scores = cross_val_score(pipe, X, y, cv=3)
scores









    Out[16]:





array([ 0.89779942,  0.89747691,  0.89510373])



In [15]:

    
# 11. Get the mean of cross validation scores from your pipeline.

# Collapse the per-fold scores into a single summary figure for the pipeline.
score_mean = scores.mean()
score_mean









    Out[15]:





0.89679334986607062



In [ ]:

    
# 12. Now repeat with Support Vector Machine Classifier (sklearn.svm.SVC) pipeline. Which yields better results?

from sklearn.svm import SVC

# Same scaler + classifier layout as the KNN pipeline, with SVC (default
# settings) as the final step. Note: this rebinds `pipe` and `scores`, so the
# KNN objects from the earlier steps are replaced from here on.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('classifier', SVC())])

pipe.fit(X, y)

# Label the sample rows with X's column names so the input matches the
# DataFrame the pipeline was fitted on (recent scikit-learn releases warn
# about missing feature names when given a bare list).
new_claims = pd.DataFrame(
    [[100, 80], [5000, 4000], [350, 900]],
    columns=X.columns,
)
print(pipe.predict(new_claims))

# cv=3 is pinned so the SVC scores use the same number of folds as the three
# KNN scores recorded in step 10, whatever the installed scikit-learn's default.
scores = cross_val_score(pipe, X, y, cv=3)
print(scores)

# Compare this mean against `score_mean` from step 11 to answer the question.
print(scores.mean())









    



['Debt collection' 'Consumer Loan' 'Debt collection']

	Product	Consumer Claim	Amount Received
0	Credit card	332.63	130.22
1	Debt collection	54.79	49.14
2	Credit card	215.04	155.28
3	Credit reporting	3.31	4.59
4	Debt collection	73.99	46.02

Basic Scikit-Learn Exercises

Next: Back To Beginning