Recipe websites allow you to bookmark certain recipes as "favourites". A student named Jeremy Cohen pulled together a sample of such data for an excellent machine learning project, and we'll use his dataset to demo some unsupervised machine learning with MLDB.
The notebook cells below use pymldb's Connection class to make REST API calls. You can check out the Using pymldb Tutorial for more details.
In [13]:
from pymldb import Connection
mldb = Connection("http://localhost/")
The sequence of procedures below is based on the one explained in the Mapping Reddit demo notebook.
First we import the raw data and make a sparse matrix out of it.
In [14]:
print mldb.put('/v1/procedures/import_rcp', {
"type": "import.text",
"params": {
"headers": ["user_id", "recipe_id"],
"dataFileUrl": "http://public.mldb.ai/favorites.csv.gz",
"outputDataset": "rcp_raw",
"runOnCreation": True
}
})
print mldb.post('/v1/procedures', {
"id": "rcp_import",
"type": "transform",
"params": {
"inputData": "select pivot(recipe_id, 1) as * named user_id from rcp_raw group by user_id",
"outputDataset": "recipes",
"runOnCreation": True
}
})
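As a quick sanity check (this cell is an addition, not part of the original demo), we can peek at a few rows of the pivoted dataset: each row is a user, each column is a recipe ID, and a cell contains 1 where that user favourited that recipe.
mldb.query("select * from recipes limit 3")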
We then train an SVD decomposition to embed the recipes in a low-dimensional space, and run K-Means clustering on the resulting embedding.
In [76]:
print mldb.post('/v1/procedures', {
"id": "rcp_svd",
"type" : "svd.train",
"params" : {
"trainingData": "select * from recipes",
"columnOutputDataset" : "rcp_svd_embedding_raw",
"runOnCreation": True
}
})
num_centroids = 16
print mldb.post('/v1/procedures', {
"id" : "rcp_kmeans",
"type" : "kmeans.train",
"params" : {
"trainingData" : "select * from rcp_svd_embedding_raw",
"outputDataset" : "rcp_kmeans_clusters",
"centroidsDataset" : "rcp_kmeans_centroids",
"numClusters" : num_centroids,
"runOnCreation": True
}
})
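Before moving on, it's worth confirming that the clustering isn't degenerate, with most recipes piled into a single cluster. This quick query (again an addition to the original demo) counts the recipes assigned to each cluster:
mldb.query("""
select cluster, count(*) as num_recipes
from rcp_kmeans_clusters
group by cluster
""")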
Now we import the actual recipe names, clean them up a bit, and get a version of our SVD embedding whose row names include the recipe names.
In [77]:
print mldb.put('/v1/procedures/import_rcp_names_raw', {
'type': 'import.text',
'params': {
'dataFileUrl': 'http://public.mldb.ai/recipes.csv.gz',
'outputDataset': "rcp_names_raw",
'delimiter':'',
'quoteChar':'',
'runOnCreation': True
}
})
print mldb.put('/v1/procedures/rcp_names_import', {
'type': 'transform',
'params': {
'inputData': '''
select jseval(
'return s.substr(s.indexOf(",") + 1)
.replace(/"/g, "")
.replace(/®/g, "");',
's', lineText) as name
named implicit_cast(rowName()) - 1
from rcp_names_raw
''',
'outputDataset': 'rcp_names',
'runOnCreation': True
}
})
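# For reference (illustration only; MLDB runs the JavaScript passed to jseval
# above), the cleanup is equivalent to this Python function: keep the text
# after the first comma, then strip double quotes and (R) marks. It assumes
# each line of recipes.csv.gz looks like 'id,name'. The 'named' clause above
# then shifts the import line numbers down by one so row names match recipe IDs.
def clean_name(line):
    return line[line.index(',') + 1:].replace('"', '').replace(u'\xae', '')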
print mldb.put('/v1/procedures/rcp_clean_svd', {
'type': 'transform',
'params': {
'inputData': """
select rcp_svd_embedding_raw.* as *
named rcp_names.rowName()+'-'+rcp_names.name
from rcp_svd_embedding_raw
join rcp_names on (rcp_names.rowName() = rcp_svd_embedding_raw.rowPathElement(0))
""",
'outputDataset': {'id': 'rcp_svd_embedding',
'type': 'embedding',
'params': {'metric': 'cosine'}},
'runOnCreation': True
}
})
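As another quick check (an addition), the row names of the new embedding should now combine each recipe ID with its cleaned-up name:
mldb.query("select rowName() as name from rcp_svd_embedding limit 5")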
With all that pre-processing done, let's look at the names of the 3 recipes closest to each cluster centroid, to get a sense of what kinds of clusters we obtained.
In [78]:
mldb.put("/v1/functions/nearestRecipe", {
"type":"embedding.neighbors",
"params": { "dataset": "rcp_svd_embedding", "defaultNumNeighbors": 3 }
})
mldb.query("""
select nearestRecipe({coords: {*}})[neighbors] as * from rcp_kmeans_centroids
""").applymap(lambda x: x.split('-')[1])
Out[78]:
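The same function gives us recipe-to-recipe similarity for free. As an extra illustration (using the first few rows to stand in for any recipe of interest; note that each recipe's nearest neighbour is itself), we can ask for the neighbours of individual recipes:
mldb.query("""
select nearestRecipe({coords: {*}})[neighbors] as *
from rcp_svd_embedding
limit 3
""").applymap(lambda x: x.split('-', 1)[1])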
We can see a bit of a pattern just from the names of the recipes nearest to the centroids, but we can probably do better! Let's try to extract the most characteristic words used in the recipe names for each cluster.
In [84]:
print mldb.put('/v1/procedures/sum_words_per_cluster', {
'type': 'transform',
'params': {
'inputData': """
select sum({tokens.* as *}) as *
named c.cluster
from (
SELECT lower(n.name),
tokenize('recipe ' + lower(n.name), {splitChars:' -.;&!''()",', minTokenLength: 4}) as tokens,
c.cluster
FROM rcp_names as n
JOIN rcp_kmeans_clusters as c ON (n.rowName() = c.rowPathElement(0))
order by n.rowName()
)
group by c.cluster
""",
'outputDataset': 'rcp_cluster_word_counts',
'runOnCreation': True
}
})
mldb.query("""select * from rcp_cluster_word_counts order by implicit_cast(rowName())""")
Out[84]:
We can use this to compute a TF-IDF score for each word in each cluster. This score gives us an idea of the relative importance of each word within a given cluster.
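Concretely, with the tfType 'log' and idfType 'inverse' settings used below, the score for a word in a cluster looks roughly like this minimal sketch (assuming the textbook definitions; see the MLDB tfidf documentation for the exact formulas it implements):
from math import log

def tfidf_sketch(word_count_in_cluster, num_clusters, num_clusters_with_word):
    tf = log(1 + word_count_in_cluster)                       # tfType: 'log'
    idf = log(num_clusters / (1.0 + num_clusters_with_word))  # idfType: 'inverse'
    return tf * idf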
In [82]:
print mldb.put('/v1/procedures/train_tfidf', {
'type': 'tfidf.train',
'params': {
'trainingData': "select * from rcp_cluster_word_counts",
'modelFileUrl': 'file:///mldb_data/models/rcp_tfidf.idf',
'runOnCreation': True
}
})
print mldb.put('/v1/functions/rcp_tfidf', {
'type': 'tfidf',
'params': {
'modelFileUrl': 'file:///mldb_data/models/rcp_tfidf.idf',
'tfType': 'log', 'idfType': 'inverse'
}
})
print mldb.put('/v1/procedures/apply_tfidf', {
'type': 'transform',
'params': {
'inputData': "select rcp_tfidf({input: {*}})[output] as * from rcp_cluster_word_counts",
'outputDataset': 'rcp_cluster_word_scores',
'runOnCreation': True
}
})
mldb.query("select * from rcp_cluster_word_scores order by implicit_cast(rowName())")
Out[82]:
If we transpose that dataset, we can get the highest-scoring words for each cluster and display them nicely in a word cloud.
In [83]:
import json
from ipywidgets import interact
from IPython.display import IFrame, display
html = """
<script src="https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.6/d3.min.js"></script>
<script src="https://static.mldb.ai/d3.layout.cloud.js"></script>
<script src="https://static.mldb.ai/wordcloud.js"></script>
<body> <script>drawCloud(%s)</script> </body>
"""
@interact
def cluster_word_cloud(cluster=(0, num_centroids - 1)):  # a (min, max) tuple gives a slider over all clusters
num_words = 20
cluster_words = mldb.get(
'/v1/query',
q="""
SELECT rowName() as text
FROM transpose(rcp_cluster_word_scores)
ORDER BY "{0}" DESC
LIMIT {1}
""".format(cluster, num_words),
format='aos',
rowNames=0
).json()
for i,x in enumerate(cluster_words):
x['size'] = num_words - i
display( IFrame("data:text/html," + (html % json.dumps(cluster_words)).replace('"',"'"), 850, 350) )
Much better!
Check out the other Tutorials and Demos.