In [1]:
#Import libraries
import pandas as pd
from scipy.spatial.distance import cosine
In [2]:
data = pd.read_csv("../data/groceries.csv")
In [3]:
data.head(100)
Out[3]:
In [4]:
#Assume that for all items only one quantity was bought
Exercise 1: Add a column named `Quantity` to `data` that has the value 1 in every row.
In [5]:
data["Quantity"] = 1
In [6]:
data.head()
Out[6]:
In [7]:
# Number of distinct items in the long-format data
len(data["item"].unique())
Out[7]:
In [8]:
#This layout (one row per Person/item purchase) is called LONG format.
#It is not convenient for the analysis below,
#so we convert it to WIDE format: one row per Person, one column per item.
In [9]:
# Converting data from long to wide format:
# one row per Person, one column per item, cell = Quantity (NaN where there is no purchase row).
# Keyword arguments are used because pandas >= 2.0 no longer accepts positional
# arguments to DataFrame.pivot.
dataWide = data.pivot(index="Person", columns="item", values="Quantity")
In [10]:
dataWide.head()
Out[10]:
Exercise 2 Print the data for Person number 2
In [11]:
# Boolean mask on the index: keeps the rows whose Person label equals 2 (result is a DataFrame)
dataWide[dataWide.index==2]
Out[11]:
In [12]:
# Positional slice: the row at position 1 (the second row), returned as a one-row DataFrame.
# This is Person 2 only if the Person labels run 1, 2, 3, ... in order -- TODO confirm.
dataWide.iloc[1:2,:]
Out[12]:
In [13]:
# Label-based lookup: the row whose index label (Person) is 2
dataWide.loc[2,:]
Out[13]:
Exercise 3 Print the data for row number 2
In [14]:
# Position-based lookup: the second row (position 1), whatever its Person label is
dataWide.iloc[1,:]
Out[14]:
In [15]:
#Replace NA with 0 (a Person/item pair with no purchase row gets quantity 0).
#Reassign instead of using inplace=True so the cell reads as an explicit transformation.
dataWide = dataWide.fillna(0)
In [16]:
dataWide.head()
Out[16]:
In [17]:
#Work on a copy so that dataWide itself is left unchanged;
#the Person column is dropped from the copy in a later cell.
data_ib = dataWide.copy()
In [18]:
data_ib.head()
Out[18]:
In [19]:
# Turn the Person index into an ordinary column with a fresh 0..n-1 row index.
# Built from dataWide (of which data_ib is an unmodified copy at this point) so that
# re-running this cell does not keep adding extra "index" columns.
data_ib = dataWide.reset_index()
In [20]:
data_ib.head()
Out[20]:
In [21]:
#Drop the Person column so that only the item columns remain
#(alternative by position, Person being the first column: data_ib = data_ib.iloc[:,1:])
data_ib = data_ib.drop("Person", axis=1)
In [22]:
data_ib.head()
Out[22]:
In [23]:
# Empty item-by-item frame (rows and columns are both the item names of data_ib);
# it is filled with the pairwise similarities afterwards
data_ibs = pd.DataFrame(columns=data_ib.columns, index=data_ib.columns)
In [24]:
data_ibs.head()
Out[24]:
We will now find similarities.
We will use cosine similarity
The resulting similarity ranges from −1 meaning exactly opposite, to 1 meaning exactly the same, with 0 indicating orthogonality (decorrelation), and in-between values indicating intermediate similarity or dissimilarity.
src https://en.wikipedia.org/wiki/Cosine_similarity
In essence, the cosine similarity of two columns $a$ and $b$ is their sum product (dot product) divided by the product of the square roots of each column's sum of squares: $\text{sim}(a, b) = \dfrac{\sum_k a_k b_k}{\sqrt{\sum_k a_k^2}\,\sqrt{\sum_k b_k^2}}$.
In [25]:
# Fill the item-by-item placeholder with pairwise cosine similarities.
# i and j are column positions, so .iloc is used (.ix was removed in pandas 1.0).
for i in range(len(data_ibs.columns)):
    # Compare item column i against every item column j
    for j in range(len(data_ibs.columns)):
        # scipy's cosine() is a distance, hence similarity = 1 - distance
        data_ibs.iloc[i, j] = 1 - cosine(data_ib.iloc[:, i], data_ib.iloc[:, j])
In [26]:
data_ibs.head()
Out[26]:
With our similarity matrix filled out, we can look for each item's “neighbours” by looping through ‘data_ibs’, sorting each column in descending order, and grabbing the names of the top 3 products.
In [27]:
# One row per item; columns 1..3 receive the names of the 3 highest-similarity items
data_neighbours = pd.DataFrame(index=data_ibs.columns, columns=range(1, 4))

# Loop through our similarity dataframe and fill in neighbouring item names
for i in range(len(data_ibs.columns)):
    # Sort column i by similarity, highest first, and keep the labels of the top 3.
    # Positional access uses .iloc (.ix was removed in pandas 1.0).
    top_items = data_ibs.iloc[:, i].sort_values(ascending=False).iloc[:3].index
    data_neighbours.iloc[i, :3] = top_items.values
In [28]:
data_neighbours
Out[28]:
Exercise 4 Modify the above code to print the top 10 similar products for each product
In [29]:
# One row per item; columns 1..10 receive the names of the 10 highest-similarity items
data_neighbours = pd.DataFrame(index=data_ibs.columns, columns=range(1, 11))

# Loop through our similarity dataframe and fill in neighbouring item names
for i in range(len(data_ibs.columns)):
    # Sort column i by similarity, highest first, and keep the labels of the top 10.
    # Positional access uses .iloc (.ix was removed in pandas 1.0).
    top_items = data_ibs.iloc[:, i].sort_values(ascending=False).iloc[:10].index
    data_neighbours.iloc[i, :10] = top_items.values

data_neighbours
Out[29]:
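The process for creating a User Based recommendation system is as follows:
1. Start from the item-item similarity matrix (`data_ibs`) and the neighbour table (`data_neighbours`) built above.
2. For every person and every item they have not bought, look up that item's most similar items.
3. Fetch the person's purchase record for those neighbouring items.
4. Combine purchases and similarities into a score (the similarity-weighted average computed by `getScore`).
5. Recommend the items with the highest scores.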
In [30]:
#Helper function to get similarity scores
def getScore(history, similarities):
    """Return the similarity-weighted average of a purchase history.

    Parameters
    ----------
    history : array-like of numbers
        Purchase values for a set of items.
    similarities : array-like of numbers
        Similarity weights for those items. Must support element-wise
        multiplication with ``history`` (e.g. numpy arrays or pandas Series).

    Returns
    -------
    number
        ``sum(history * similarities) / sum(similarities)``, or ``0.0`` when
        the weights sum to zero (including the empty case).
    """
    total_similarity = sum(similarities)
    # Guard the division: empty or all-zero weights would otherwise raise
    # ZeroDivisionError or produce NaN instead of a usable score.
    if total_similarity == 0:
        return 0.0
    return sum(history * similarities) / total_similarity
#Understand what this function does !
In [31]:
# Wide purchase matrix with the Person index turned back into an ordinary column
data_sims1 = dataWide.reset_index()
In [32]:
data_sims1.head()
Out[32]:
In [33]:
# Create a place holder matrix for similarities, and fill in the user name column
data_sims = pd.DataFrame(index=data_sims1.index, columns=data_sims1.columns)
# Copy only the first column across by position; .iloc replaces .ix (removed in pandas 1.0).
# Raw values are assigned so the placeholder's existing column is filled in place.
data_sims.iloc[:, :1] = data_sims1.iloc[:, :1].values
In [34]:
#This is the same as our original data but with nothing filled in except the headers
data_sims.head()
Out[34]:
In [35]:
# First 500 rows of the purchase matrix, used by the scoring loop.
# Presumably a cap to keep the nested loop's runtime manageable -- TODO confirm.
data_sims12 = data_sims1.iloc[:500,:]
In [36]:
# Matching first-500-row slice of the score placeholder; the scoring loop writes into it.
# NOTE(review): later cells read the scores from data_sims, so this appears to rely on
# the slice sharing memory with data_sims -- confirm for the pandas version in use.
data_sims11 = data_sims.iloc[:500,:]
In [37]:
# Score every (person, item) pair for the rows held in data_sims11.
# Column 0 is Person, so item columns start at position 1.

# The neighbour names and similarities depend only on the product, so they are looked up
# once per product rather than once per (person, product) pair.
neighbour_lookup = {}
for product in data_sims11.columns[1:]:
    # Positions 1..9 of each ranking are used; position 0 is skipped
    # (normally the product itself, whose similarity with itself is highest).
    product_top_names = data_neighbours.loc[product].iloc[1:10].tolist()
    product_top_sims = data_ibs.loc[product].sort_values(ascending=False).iloc[1:10]
    neighbour_lookup[product] = (product_top_names, product_top_sims)

for i in range(len(data_sims11.index)):
    user = data_sims11.index[i]
    for j in range(1, len(data_sims11.columns)):
        product = data_sims11.columns[j]
        # Single-step .iloc[i, j] replaces the chained .ix[i][j] access
        # (.ix was removed in pandas 1.0, and chained assignment may write to a temporary copy).
        if data_sims12.iloc[i, j] == 1:
            # Already bought by this person: score 0
            data_sims11.iloc[i, j] = 0
        else:
            product_top_names, product_top_sims = neighbour_lookup[product]
            # data_ib rows are looked up by label, matching data_sims11's row labels
            user_purchases = data_ib.loc[user, product_top_names]
            data_sims11.iloc[i, j] = getScore(user_purchases, product_top_sims)
    # Progress indicator, once per processed row; print() works on Python 2 and 3
    print(i)

# Later cells read the scores from data_sims. data_sims11 is the leading row slice of
# data_sims, so copy the scores back explicitly instead of relying on the two frames
# sharing memory (which is not guaranteed, e.g. under pandas Copy-on-Write).
data_sims.iloc[:len(data_sims11.index), 1:] = data_sims11.iloc[:, 1:].values
In [38]:
# Get the top products: one row per row of data_sims, Person plus six recommendation slots
data_recommend = pd.DataFrame(index=data_sims.index, columns=['Person','1','2','3','4','5','6'])
# Carry the first column of data_sims (Person) across by position;
# .iloc replaces .ix, which was removed in pandas 1.0
data_recommend.iloc[:, 0] = data_sims.iloc[:, 0].values
In [39]:
# Instead of top product scores, we want to see names
for i in range(len(data_sims.index)):
    # Rank only the item columns (position 1 onward). Excluding the Person column
    # explicitly avoids relying on the Person id sorting above every score.
    item_scores = data_sims.iloc[i, 1:]
    # Labels of the six highest-scoring items fill the six recommendation columns
    top_items = item_scores.sort_values(ascending=False).iloc[:6].index
    data_recommend.iloc[i, 1:] = top_items.values
In [40]:
# Print a sample: the first 11 rows (labels 0..10, as the inclusive label slice :10 selected)
# and the first four columns; .iloc replaces .ix, which was removed in pandas 1.0
data_recommend.iloc[:11, :4]
Out[40]:
This case/code was inspired by http://www.salemmarafi.com/code/collaborative-filtering-with-python/
Look into that link for more information
More links: http://blogs.gartner.com/martin-kihn/how-to-build-a-recommender-system-in-python/
In [ ]: