In [109]:
from pandas import DataFrame, read_csv
import numpy as np
import pandas as pd
from sklearn import cross_validation
from sklearn import linear_model

# General syntax for importing a whole library under an alias (no individual functions):
##   import (library) as (nickname/alias for the library)
import matplotlib.pyplot as plt
import pandas as pd #this is how I usually import pandas
import sys #only needed to determine Python version number

# Enable inline plotting
%matplotlib inline

# Report the interpreter and pandas versions so results can be tied to an environment.
# The parenthesised single-argument form prints the same text under Python 2.7
# (a print statement applied to a parenthesised expression) and also runs
# under Python 3, where print is a function.
print('Python version ' + sys.version)
print('Pandas version ' + pd.__version__)


Python version 2.7.9 |Anaconda 2.2.0 (64-bit)| (default, Dec 18 2014, 16:57:52) [MSC v.1500 64 bit (AMD64)]
Pandas version 0.15.2

In [6]:
# Read the training set from train.csv (path is relative to the working directory).
df = pd.read_csv('train.csv')
# Show the rows at positions 1-19; the row at position 0 is not part of this preview.
df.iloc[1:20]


Out[6]:
TripType VisitNumber Weekday Upc ScanCount DepartmentDescription FinelineNumber
1 30 7 Friday 60538815980 1 SHOES 8931
2 30 7 Friday 7410811099 1 PERSONAL CARE 4504
3 26 8 Friday 2238403510 2 PAINT AND ACCESSORIES 3565
4 26 8 Friday 2006613744 2 PAINT AND ACCESSORIES 1017
5 26 8 Friday 2006618783 2 PAINT AND ACCESSORIES 1017
6 26 8 Friday 2006613743 1 PAINT AND ACCESSORIES 1017
7 26 8 Friday 7004802737 1 PAINT AND ACCESSORIES 2802
8 26 8 Friday 2238495318 1 PAINT AND ACCESSORIES 4501
9 26 8 Friday 2238400200 -1 PAINT AND ACCESSORIES 3565
10 26 8 Friday 5200010239 1 DSD GROCERY 4606
11 26 8 Friday 88679300501 2 PAINT AND ACCESSORIES 3504
12 26 8 Friday 22006000000 1 MEAT - FRESH & FROZEN 6009
13 26 8 Friday 2236760452 1 PAINT AND ACCESSORIES 7
14 26 8 Friday 88679300501 -1 PAINT AND ACCESSORIES 3504
15 26 8 Friday 2238400200 2 PAINT AND ACCESSORIES 3565
16 26 8 Friday 3019294203 1 PAINT AND ACCESSORIES 2801
17 26 8 Friday 72450408840 1 PAINT AND ACCESSORIES 1028
18 26 8 Friday 25541500000 2 DAIRY 1305
19 26 8 Friday 2310010776 1 PETS AND SUPPLIES 3300

In [72]:
# Histogram (100 bins) of the Upc values taken from the first 200 rows of df.
df['Upc'].head(200).plot(kind='hist', bins=100)


Out[72]:
<matplotlib.axes._subplots.AxesSubplot at 0x113d7828>

In [70]:
# Keep only the rows whose Upc is a finite number (NaN and +/-inf entries are dropped).
df = df.loc[np.isfinite(df['Upc'])]
# Preview the first 20 rows that remain.
df.head(20)


Out[70]:
TripType VisitNumber Weekday Upc ScanCount DepartmentDescription FinelineNumber
0 999 5 Friday 68113152929 -1 FINANCIAL SERVICES 1000
1 30 7 Friday 60538815980 1 SHOES 8931
2 30 7 Friday 7410811099 1 PERSONAL CARE 4504
3 26 8 Friday 2238403510 2 PAINT AND ACCESSORIES 3565
4 26 8 Friday 2006613744 2 PAINT AND ACCESSORIES 1017
5 26 8 Friday 2006618783 2 PAINT AND ACCESSORIES 1017
6 26 8 Friday 2006613743 1 PAINT AND ACCESSORIES 1017
7 26 8 Friday 7004802737 1 PAINT AND ACCESSORIES 2802
8 26 8 Friday 2238495318 1 PAINT AND ACCESSORIES 4501
9 26 8 Friday 2238400200 -1 PAINT AND ACCESSORIES 3565
10 26 8 Friday 5200010239 1 DSD GROCERY 4606
11 26 8 Friday 88679300501 2 PAINT AND ACCESSORIES 3504
12 26 8 Friday 22006000000 1 MEAT - FRESH & FROZEN 6009
13 26 8 Friday 2236760452 1 PAINT AND ACCESSORIES 7
14 26 8 Friday 88679300501 -1 PAINT AND ACCESSORIES 3504
15 26 8 Friday 2238400200 2 PAINT AND ACCESSORIES 3565
16 26 8 Friday 3019294203 1 PAINT AND ACCESSORIES 2801
17 26 8 Friday 72450408840 1 PAINT AND ACCESSORIES 1028
18 26 8 Friday 25541500000 2 DAIRY 1305
19 26 8 Friday 2310010776 1 PETS AND SUPPLIES 3300

In [26]:
# Collect the distinct department names (Dpts) and weekday names (Wkds) found in df.
Dpts, Wkds = (df[col].unique() for col in ('DepartmentDescription', 'Weekday'))


Out[26]:
array(['Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday',
       'Thursday'], dtype=object)

In [27]:
# Display the array of distinct department names held in Dpts.
Dpts


Out[27]:
array(['FINANCIAL SERVICES', 'SHOES', 'PERSONAL CARE',
       'PAINT AND ACCESSORIES', 'DSD GROCERY', 'MEAT - FRESH & FROZEN',
       'DAIRY', 'PETS AND SUPPLIES', 'HOUSEHOLD CHEMICALS/SUPP',
       'IMPULSE MERCHANDISE', 'PRODUCE', 'CANDY TOBACCO COOKIES',
       'GROCERY DRY GOODS', 'BOYS WEAR', 'FABRICS AND CRAFTS',
       'JEWELRY AND SUNGLASSES', 'MENS WEAR', 'ACCESSORIES',
       'HOME MANAGEMENT', 'FROZEN FOODS', 'SERVICE DELI',
       'INFANT CONSUMABLE HARDLINES', 'PRE PACKED DELI', 'COOK AND DINE',
       'PHARMACY OTC', 'LADIESWEAR', 'COMM BREAD', 'BAKERY',
       'HOUSEHOLD PAPER GOODS', 'CELEBRATION', 'HARDWARE', 'BEAUTY',
       'AUTOMOTIVE', 'BOOKS AND MAGAZINES', 'SEAFOOD', 'OFFICE SUPPLIES',
       'LAWN AND GARDEN', 'SHEER HOSIERY', 'WIRELESS', 'BEDDING',
       'BATH AND SHOWER', 'HORTICULTURE AND ACCESS', 'HOME DECOR', 'TOYS',
       'INFANT APPAREL', 'LADIES SOCKS', 'PLUS AND MATERNITY',
       'ELECTRONICS', 'GIRLS WEAR 4-6X  AND 7-14', 'BRAS & SHAPEWEAR',
       'LIQUOR WINE BEER', 'SLEEPWEAR/FOUNDATIONS', 'CAMERAS AND SUPPLIES',
       'SPORTING GOODS', 'PLAYERS AND ELECTRONICS', 'PHARMACY RX',
       'MENSWEAR', 'OPTICAL - FRAMES', 'SWIMWEAR/OUTERWEAR',
       'OTHER DEPARTMENTS', 'MEDIA AND GAMING', 'FURNITURE',
       'OPTICAL - LENSES', 'SEASONAL', 'LARGE HOUSEHOLD GOODS',
       '1-HR PHOTO', 'CONCEPT STORES', 'HEALTH AND BEAUTY AIDS'], dtype=object)

In [29]:
# Display the array of distinct weekday names held in Wkds.
Wkds


Out[29]:
array(['Friday', 'Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday',
       'Thursday'], dtype=object)

In [48]:
# Take an independent (deep) copy of df to work on, so edits to df2 do not touch df.
df2 = df.copy(deep=True)

In [74]:
# Encode the two text columns as integer codes: each name is replaced by its
# position in the corresponding array of distinct names (Dpts / Wkds).
#
# Only the column that actually holds each set of names is touched, and each
# column is converted in a single pass through a dictionary lookup. This avoids
# one frame-wide `replace` per category, each of which rescans and copies every
# column of the frame.
#
# Values that are not in the lookup table are left as they are, so re-running
# this cell on an already-encoded df2 is a no-op.
for column, categories in (('DepartmentDescription', Dpts), ('Weekday', Wkds)):
    codes = {name: code for code, name in enumerate(categories)}
    df2[column] = df2[column].map(lambda value: codes.get(value, value))

In [85]:
# Convert the encoded frame to a plain NumPy array. `.values` yields the same
# array as the older `as_matrix()` call and is available in both old and new
# pandas releases (`as_matrix()` was deprecated and then removed in pandas 1.0).
Data = df2.values

In [89]:
# Display the (rows, columns) dimensions of the Data array.
Data.shape


Out[89]:
(642925L, 7L)

In [101]:
# Split the matrix column-wise: column 0 becomes the target vector and every
# remaining column becomes a feature. Going by the column order displayed for
# df, column 0 is TripType.
Ytrain, Xtrain = Data[:, 0], Data[:, 1:]

In [ ]:
# Display the (rows, columns) dimensions of the feature matrix.
Xtrain.shape

In [ ]:
# Logistic regression classifier. C is the inverse regularisation strength, so
# C=1e5 asks for a very weakly regularised fit.
logreg = linear_model.LogisticRegression(C=1e5)

# Evaluate the classifier with 5-fold cross-validation on the training data;
# the result holds one score per fold.
scores = cross_validation.cross_val_score(estimator=logreg, X=Xtrain, y=Ytrain, cv=5)

In [ ]:
# Display the per-fold cross-validation scores.
scores