Load the pins collection into a DataFrame.


In [2]:
%load_ext autoreload
%autoreload 2

from pprint import pprint
import pandas as pd
import pymongo
import numpy as np
import time
import vislab

In [7]:
db = vislab.util.get_mongodb_client()['pinscraping']
print db.collection_names()


[]

In [4]:
print(db['pins'].count())
pprint(db['pins'].find_one())


3172020
{u'_id': u'www.pinterest.com/pin/100205160428494657/',
 u'board_name': u'soft-mint',
 u'caption': u'cupcakes',
 u'img': u'http://media-cache-ec0.pinimg.com/736x/cd/49/9c/cd499c1fdd84af17856e0a219031a5c4.jpg',
 u'pin_url': u'www.pinterest.com/pin/100205160428494657/',
 u'query': [u'soft'],
 u'repins': 25,
 u'repins_likes_url': [u'www.pinterest.com/pin/100205160428494657/repins/',
                       u'www.pinterest.com/pin/100205160428494657/likes/'],
 u'source': u'sprinklebakes.com',
 u'username': u'rainqueen'}

In [5]:
# Make DataFrame, and use the unique pin id as image_id.
pins_df = pd.DataFrame(list(db['pins'].find(
    fields=['img', 'query', 'repins', 'board_name', 'username', 'caption', '_id'])))
pins_df.index = pd.Index([_.split('/')[-2] for _ in pins_df['_id']], name='image_id')
del pins_df['_id']

In [6]:
# Expand the query list to boolean columns.
pins_df_expanded = pd.DataFrame(
    data=[
        dict([('query_' + _, True) for _ in row])
        for row in pins_df['query']
    ],
    index=pins_df.index
).fillna(False)
print(pins_df_expanded.sum(0))


query_animals                  124540
query_black and white          120075
query_bokeh                     56745
query_bright                    87402
query_calm                      80710
query_cloudy                    50980
query_corporate                 60794
query_depth of field            11824
query_detailed                  77693
query_energetic                 42013
query_ethereal                  70961
query_fall                     100011
query_futuristic                67141
query_geometric composition      6473
query_happy                    101777
query_hazy                      43020
query_hdr                       34148
query_horror                    75807
query_industrial                69805
query_instagram                 53577
query_landscape                 84209
query_long Exposure             15882
query_macro                     61991
query_melancholy                45922
query_minimal                   67451
query_nature                   104509
query_night                     75123
query_noir                      84710
query_organic                   60437
query_pastel                    66566
query_pensive                   41108
query_portrait                  70789
query_radiant                   43687
query_romantic                  62142
query_sad                       48270
query_scary                     33889
query_sepia                     86783
query_serene                    58312
query_sleek                     61255
query_soft                      86433
query_spring                    57314
query_summer                    87798
query_sunny                     71979
query_tense                     29677
query_texture                   60050
query_upbeat                    29469
query_vintage                  107221
query_washed out                37609
query_winter                    95939
dtype: int64

In [7]:
pins_df = pins_df.join(pins_df_expanded)
del pins_df['query']

In [13]:
query_cols = [_ for _ in pins_df.columns if _.startswith('query_')]
pins_df[query_cols] = pins_df[query_cols].astype(bool)

In [15]:
pins_df.to_hdf('../data/shared/pins_df_mar21.h5', 'df', mode='w', complib='blosc', complevel=9)
!ls -lh ../data/shared


total 1.1G
-rw-r--r-- 1 sergeyk sergeyk  37M Oct 14 18:51 ava.h5
-rw-r--r-- 1 sergeyk sergeyk 1.5M Oct 17 21:04 ava_style.h5
-rw-r--r-- 1 sergeyk sergeyk  35M Oct 15 00:42 ava_urls.h5
-rw-r--r-- 1 sergeyk sergeyk 5.7K Dec 28 20:55 behance_exps.sh
drwxr-xr-x 2 sergeyk sergeyk 4.0K Dec 28 15:59 db
-rw-r--r-- 1 sergeyk sergeyk 8.6M Mar 20 12:16 flickr_df copy.h5
-rw-r--r-- 1 sergeyk sergeyk 8.6M Mar 20 12:16 flickr_df.h5
-rw-r--r-- 1 sergeyk sergeyk  15M Mar 21 17:15 flickr_df_mar2014.h5
-rw-r--r-- 1 sergeyk sergeyk  12M Feb 11 23:15 ilsvrc2013_dfs.h5
-rw-r--r-- 1 sergeyk sergeyk 2.6M Feb 14 02:57 inria_dfs.h5
-rw-r--r-- 1 sergeyk sergeyk 5.6M Jan 24 20:09 pascal_VOC2007_dfs.h5
-rw-r--r-- 1 sergeyk sergeyk 5.9M Mar 19 00:03 pascal_VOC2012_dfs.h5
-rw-r--r-- 1 sergeyk sergeyk 122M Feb 27 22:22 pins_df_feb27.h5
-rw-r--r-- 1 sergeyk sergeyk 216M Feb 28 12:30 pins_df_feb28.h5
-rw-rw-r-- 1 sergeyk sergeyk 504M Mar 21 17:48 pins_df_mar21.h5
drwxr-xr-x 2 sergeyk sergeyk 4.0K Jan 21 15:03 predict
drwxr-xr-x 2 sergeyk sergeyk 4.0K Dec 17 20:09 redis
drwxr-xr-x 2 sergeyk sergeyk 4.0K Dec 28 16:31 results
drwxr-xr-x 2 sergeyk sergeyk  64K Dec 18 05:40 rqworkers
-rw-r--r-- 1 sergeyk sergeyk  27M Sep 25 19:11 wikipaintings_basic_info.h5
-rw-r--r-- 1 sergeyk sergeyk  52M Sep 26 02:43 wikipaintings_detailed_info.h5
-rw-r--r-- 1 sergeyk sergeyk  32M Nov 14 12:21 wikipaintings_oct2013.csv
-rw-r--r-- 1 sergeyk sergeyk  13M Sep 26 02:46 wikipaintings_urls.csv
-rw-r--r-- 1 sergeyk sergeyk  17M Sep 26 02:44 wikipaintings_urls.h5
-rw-r--r-- 1 sergeyk sergeyk  15M Sep 26 02:46 wikipaintings_urls.pickle

In [7]:
df = pd.read_hdf('../data/shared/pins_df_80k_mar2014.h5', 'df')

In [8]:
df.columns


Out[8]:
Index([u'style_Detailed', u'style_Pastel', u'style_Melancholy', u'style_Noir', u'style_HDR', u'style_Vintage', u'style_Long_Exposure', u'style_Horror', u'style_Sunny', u'style_Bright', u'style_Hazy', u'style_Bokeh', u'style_Serene', u'style_Texture', u'style_Ethereal', u'style_Macro', u'style_Depth_of_Field', u'style_Geometric_Composition', u'style_Minimal', u'style_Romantic', u'image_url', u'page_url'], dtype='object')

In [9]:
df_ = pd.read_pickle('/Users/karayev/temp.pickle')


Out[9]:
caption
222787512789065621 Gold sequinned bags for confetti toss. Lady Le...
278097345711613547 Staircase at the Bristol Palace Hotel, Genoa, ...
146648531589410451 I+Love+You+Blogs+and+Tea+print+by+jenniferramo...
221520875391577171 Sigmar Polke - Grossmünster Cathedral, Zürich.
95701560802199670 snake leather studs
409616528579827127 Place Vendôme, Paris I
103442122664244081 .
75927943691655222 black and gold
103442122661856360 .those doors!
222787512790891996 Arrowood Photography San Francisco Wedding Pho...
216665432044224432 lorenzo castillo
229402174739636545 pink pout and cute bangs
27232772717846662 orange and pink
263390278181111712 Virgil Finlay’s Pen & Ink Drawings - mashKULTURE
98164466852428985
322359285799114607
204069426839050013 walter groupius door handles // #brass #hardware
274438171016534450 Poppy2 by Donovan Beeson, via Flickr
112027109452018431 Traditional wedding invite on ecru paper mount...
103442122662205909 ...
113575221824766586 wood table with brass base
95420085828451130 The perfect detail in this exuberant silhouett...
558094578792699356 Sinus Trestles// L
213639576042867699 ! Frost Chocolate Cupcakes with a dollop of h...
188517934375335002 Curtain Door by Matharoo Associates
62909726019056846 // mirror
140806212582670 Pathway Stones. I think I can, I think I can!
363313894909823093 making you own dress pins
74802043782676335 VT Interiors - Library of Inspirational Images...
98938523033681564 ---
146648531590527065
260505159669845899 Nice idea for wall numbers. - Metal and Wood?
176977460330741825 McQueen.
102949541453526203 gorgeous globe
349099408585028963 Boots
54043264252713171 Blackened Steel Fireplace from Aguirre Design Inc
442971313320385485 beautiful bohemian bracelet
138345019777658447 Tatting: Bead and Picot Bracelet. Inspiration ...
275564070921310826 Flos
58828338856896493 Sole Society
260505159667449672 Arched pocket doors. #design #interiors
382031980862417180 Burgundy trench.
273875221063062356 botanical branches
565975878143076644 amazingly said
239042692691180167 The Plumed Serpent Bridal
205476801718294880 sit
136796907404834658 Küçük Su Kasrı, İstanbul door
74802043780424913 pink couch.
99501472990454660
102949541453474338 masking tape
480337116478414754 tile
183662491025910067 Marie Antoinette's actual shoe collection
221943087860425247 .
81557443224899487 Nightstand style
344173596494100605 Dior
135952482472695183 program
192599321535737824 ஓPink ஓ
33425222206901632 pair of chic 1950's style wingback chairs
116319602849085314 Viktor & Rolf s/s 2011
430304939367767173 Versace Art Deco
...

80000 rows × 1 columns


In [13]:
df = df.join(df_)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-13-67a3b8217659> in <module>()
----> 1 df = df.join(df_)
      2 df

/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in join(self, other, on, how, lsuffix, rsuffix, sort)
   3578         # For SparseDataFrame's benefit
   3579         return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
-> 3580                                  rsuffix=rsuffix, sort=sort)
   3581 
   3582     def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',

/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _join_compat(self, other, on, how, lsuffix, rsuffix, sort)
   3592             return merge(self, other, left_on=on, how=how,
   3593                          left_index=on is None, right_index=True,
-> 3594                          suffixes=(lsuffix, rsuffix), sort=sort)
   3595         else:
   3596             if on is not None:

/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/tools/merge.pyc in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy)
     38                          right_index=right_index, sort=sort, suffixes=suffixes,
     39                          copy=copy)
---> 40     return op.get_result()
     41 if __debug__:
     42     merge.__doc__ = _merge_doc % '\nleft : DataFrame'

/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/tools/merge.pyc in get_result(self)
    187 
    188         # this is a bit kludgy
--> 189         ldata, rdata = self._get_merge_data()
    190 
    191         # TODO: more efficiently handle group keys to avoid extra

/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_merge_data(self)
    282         lsuf, rsuf = self.suffixes
    283         ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf,
--> 284                                                 copydata=False)
    285         return ldata, rdata
    286 

/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/internals.pyc in _maybe_rename_join(self, other, lsuffix, rsuffix, copydata)
   3441             if not lsuffix and not rsuffix:
   3442                 raise ValueError('columns overlap but no suffix specified: %s'
-> 3443                                  % to_rename)
   3444 
   3445             def lrenamer(x):

ValueError: columns overlap but no suffix specified: Index([u'caption'], dtype='object')

In [16]:
df.to_hdf('../data/shared/pins_df_80k_mar2014.h5', 'df', mode='w', complib='blosc', complevel=9)


/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:1251: DeprecationWarning: createGroup() is pending deprecation, use create_group() instead. You may use the pt2to3 tool to update your source code.
  group = self._handle.createGroup(path, p)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->axis0] [items->None]

  warnings.warn(ws, PerformanceWarning)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2449: DeprecationWarning: createVLArray() is pending deprecation, use create_vlarray() instead. You may use the pt2to3 tool to update your source code.
  _tables().ObjectAtom())
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->axis1] [items->None]

  warnings.warn(ws, PerformanceWarning)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2449: DeprecationWarning: createVLArray() is pending deprecation, use create_vlarray() instead. You may use the pt2to3 tool to update your source code.
  _tables().ObjectAtom())
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2422: DeprecationWarning: createCArray() is pending deprecation, use create_carray() instead. You may use the pt2to3 tool to update your source code.
  filters=self._filters)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2422: DeprecationWarning: createCArray() is pending deprecation, use create_carray() instead. You may use the pt2to3 tool to update your source code.
  filters=self._filters)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['image_url', 'page_url', u'caption']]

  warnings.warn(ws, PerformanceWarning)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2449: DeprecationWarning: createVLArray() is pending deprecation, use create_vlarray() instead. You may use the pt2to3 tool to update your source code.
  _tables().ObjectAtom())
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2446: PerformanceWarning: 
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_items] [items->None]

  warnings.warn(ws, PerformanceWarning)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2449: DeprecationWarning: createVLArray() is pending deprecation, use create_vlarray() instead. You may use the pt2to3 tool to update your source code.
  _tables().ObjectAtom())