In [2]:
# 1. Import sklearn, import pandas as pd, and pd.read_csv the CFPB CSV file into dataframe 'df'.

import sklearn
import pandas as pd

# Load the complaints CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='data/cfpb_complaints_with_fictitious_data.csv',
)

In [3]:
# 2. Filter your df down to 'Product', 'Consumer Claim', 'Amount Received' using [[]] notation. Which is our target?

# Keep only the label column and the two numeric columns that serve as features.
df = df[[
    'Product',
    'Consumer Claim',
    'Amount Received',
]]

# Answer: "Product" is the target; the other two columns are the inputs.
df.head()  # head() shows the first 5 rows by default


Out[3]:
Product Consumer Claim Amount Received
0 Credit card 332.63 130.22
1 Debt collection 54.79 49.14
2 Credit card 215.04 155.28
3 Credit reporting 3.31 4.59
4 Debt collection 73.99 46.02

In [6]:
# 3. From sklearn.model_selection import train_test_split. Make a train/test split 80/20 (we won't use it though).

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# train_test_split now lives in sklearn.model_selection. Fall back to the old
# module only when running on a pre-0.18 install.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# test_size=.2 holds out 20% of the rows; random_state pins the shuffle so the
# same rows land in each half on every run.
train, test = train_test_split(df, test_size=.2, random_state=42)
print(len(train))
print(len(test))


100544
25136

In [7]:
# 4. Assign df[['Consumer Claim', 'Amount Received']] to 'X'

# Feature matrix: the two numeric columns, kept as a two-column DataFrame.
X = df[[
    'Consumer Claim',
    'Amount Received',
]]

In [8]:
# 5. Convert to raw values df['Product'].values and assign to 'y'

# .values unwraps the Series into a plain NumPy array of labels, as the step asks.
# The fit / cross-validation calls that consume `y` accept the array just as they
# accepted the Series.
y = df['Product'].values

In [9]:
# 6. From sklearn.preprocessing import StandardScaler. From sklearn.pipeline import Pipeline.

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [11]:
# 7. From sklearn.neighbors import KNeighborsClassifier. Make a scaler/knn pipeline.

from sklearn.neighbors import KNeighborsClassifier

# Two stages: standardise the features, then classify with k-nearest neighbours.
# Both stages are constructed with their default settings.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('classifier', KNeighborsClassifier()),
])

In [12]:
# 8. Fit your pipeline with your X and y.

# Fits on the full X / y; the train/test split made earlier is not used here.
# fit() returns the pipeline itself, so the cell displays its repr.
pipe.fit(X=X, y=y)


Out[12]:
Pipeline(steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('classifier', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))])

In [13]:
# 9. Use your newly fitted pipeline to predict classifications for [[100, 80], [5000, 4000], [350, 900]] .

# Each row is one [Consumer Claim, Amount Received] pair, in the same column order as X.
pipe.predict(X=[
    [100, 80],
    [5000, 4000],
    [350, 900],
])


Out[13]:
array(['Money transfers', 'Consumer Loan', 'Other financial service'], dtype=object)

In [16]:
# 10. From sklearn.model_selection import cross_val_score. Run cross val score on your pipeline.

# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in 0.20;
# cross_val_score now lives in sklearn.model_selection. Fall back to the old
# module only when running on a pre-0.18 install.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# cv=3 is spelled out because the library default changed from 3 to 5 folds in
# scikit-learn 0.22; pinning it keeps the three-score result on any version.
scores = cross_val_score(pipe, X, y, cv=3)
scores


Out[16]:
array([ 0.89779942,  0.89747691,  0.89510373])

In [15]:
# 11. Get the mean of cross validation scores from your pipeline.

# Average the per-fold scores into a single summary number for the pipeline.
score_mean = scores.mean()
score_mean


Out[15]:
0.89679334986607062

In [ ]:
# 12. Now repeat with Support Vector Machine Classifier (sklearn.svm.SVC) pipeline. Which yields better results?

from sklearn.svm import SVC

# Same scaler + classifier layout as the KNN pipeline, with SVC (default settings)
# swapped in. Note that this rebinds `pipe` (and `scores` below), replacing the
# KNN objects created in the earlier cells.
pipe = Pipeline([('scaler', StandardScaler()),
                 ('classifier', SVC())])

pipe.fit(X, y)

print(pipe.predict([[100, 80], [5000, 4000], [350, 900]]))

# cv=3 is spelled out so the SVC is scored on the same number of folds as the
# three KNN scores recorded earlier, regardless of the installed scikit-learn
# version (the default changed from 3 to 5 folds in 0.22).
# NOTE(review): the recorded output for this cell shows only the predictions, not
# the scores — presumably the run had not finished; re-run to completion before
# answering which model does better.
scores = cross_val_score(pipe, X, y, cv=3)
print(scores)

print(scores.mean())


['Debt collection' 'Consumer Loan' 'Debt collection']