In [2]:
%load_ext autoreload
%autoreload 2
from pprint import pprint
import pandas as pd
import pymongo
import numpy as np
import time
import vislab
In [7]:
db = vislab.util.get_mongodb_client()['pinscraping']
print db.collection_names()
[]
In [4]:
print(db['pins'].count())
pprint(db['pins'].find_one())
3172020
{u'_id': u'www.pinterest.com/pin/100205160428494657/',
u'board_name': u'soft-mint',
u'caption': u'cupcakes',
u'img': u'http://media-cache-ec0.pinimg.com/736x/cd/49/9c/cd499c1fdd84af17856e0a219031a5c4.jpg',
u'pin_url': u'www.pinterest.com/pin/100205160428494657/',
u'query': [u'soft'],
u'repins': 25,
u'repins_likes_url': [u'www.pinterest.com/pin/100205160428494657/repins/',
u'www.pinterest.com/pin/100205160428494657/likes/'],
u'source': u'sprinklebakes.com',
u'username': u'rainqueen'}
In [5]:
# Make DataFrame, and use the unique pin id as image_id.
pins_df = pd.DataFrame(list(db['pins'].find(
fields=['img', 'query', 'repins', 'board_name', 'username', 'caption', '_id'])))
pins_df.index = pd.Index([_.split('/')[-2] for _ in pins_df['_id']], name='image_id')
del pins_df['_id']
In [6]:
# Expand the query list to boolean columns.
pins_df_expanded = pd.DataFrame(
data=[
dict([('query_' + _, True) for _ in row])
for row in pins_df['query']
],
index=pins_df.index
).fillna(False)
print(pins_df_expanded.sum(0))
query_animals 124540
query_black and white 120075
query_bokeh 56745
query_bright 87402
query_calm 80710
query_cloudy 50980
query_corporate 60794
query_depth of field 11824
query_detailed 77693
query_energetic 42013
query_ethereal 70961
query_fall 100011
query_futuristic 67141
query_geometric composition 6473
query_happy 101777
query_hazy 43020
query_hdr 34148
query_horror 75807
query_industrial 69805
query_instagram 53577
query_landscape 84209
query_long Exposure 15882
query_macro 61991
query_melancholy 45922
query_minimal 67451
query_nature 104509
query_night 75123
query_noir 84710
query_organic 60437
query_pastel 66566
query_pensive 41108
query_portrait 70789
query_radiant 43687
query_romantic 62142
query_sad 48270
query_scary 33889
query_sepia 86783
query_serene 58312
query_sleek 61255
query_soft 86433
query_spring 57314
query_summer 87798
query_sunny 71979
query_tense 29677
query_texture 60050
query_upbeat 29469
query_vintage 107221
query_washed out 37609
query_winter 95939
dtype: int64
In [7]:
pins_df = pins_df.join(pins_df_expanded)
del pins_df['query']
In [13]:
query_cols = [_ for _ in pins_df.columns if _.startswith('query_')]
pins_df[query_cols] = pins_df[query_cols].astype(bool)
In [15]:
pins_df.to_hdf('../data/shared/pins_df_mar21.h5', 'df', mode='w', complib='blosc', complevel=9)
!ls -lh ../data/shared
total 1.1G
-rw-r--r-- 1 sergeyk sergeyk 37M Oct 14 18:51 ava.h5
-rw-r--r-- 1 sergeyk sergeyk 1.5M Oct 17 21:04 ava_style.h5
-rw-r--r-- 1 sergeyk sergeyk 35M Oct 15 00:42 ava_urls.h5
-rw-r--r-- 1 sergeyk sergeyk 5.7K Dec 28 20:55 behance_exps.sh
drwxr-xr-x 2 sergeyk sergeyk 4.0K Dec 28 15:59 db
-rw-r--r-- 1 sergeyk sergeyk 8.6M Mar 20 12:16 flickr_df copy.h5
-rw-r--r-- 1 sergeyk sergeyk 8.6M Mar 20 12:16 flickr_df.h5
-rw-r--r-- 1 sergeyk sergeyk 15M Mar 21 17:15 flickr_df_mar2014.h5
-rw-r--r-- 1 sergeyk sergeyk 12M Feb 11 23:15 ilsvrc2013_dfs.h5
-rw-r--r-- 1 sergeyk sergeyk 2.6M Feb 14 02:57 inria_dfs.h5
-rw-r--r-- 1 sergeyk sergeyk 5.6M Jan 24 20:09 pascal_VOC2007_dfs.h5
-rw-r--r-- 1 sergeyk sergeyk 5.9M Mar 19 00:03 pascal_VOC2012_dfs.h5
-rw-r--r-- 1 sergeyk sergeyk 122M Feb 27 22:22 pins_df_feb27.h5
-rw-r--r-- 1 sergeyk sergeyk 216M Feb 28 12:30 pins_df_feb28.h5
-rw-rw-r-- 1 sergeyk sergeyk 504M Mar 21 17:48 pins_df_mar21.h5
drwxr-xr-x 2 sergeyk sergeyk 4.0K Jan 21 15:03 predict
drwxr-xr-x 2 sergeyk sergeyk 4.0K Dec 17 20:09 redis
drwxr-xr-x 2 sergeyk sergeyk 4.0K Dec 28 16:31 results
drwxr-xr-x 2 sergeyk sergeyk 64K Dec 18 05:40 rqworkers
-rw-r--r-- 1 sergeyk sergeyk 27M Sep 25 19:11 wikipaintings_basic_info.h5
-rw-r--r-- 1 sergeyk sergeyk 52M Sep 26 02:43 wikipaintings_detailed_info.h5
-rw-r--r-- 1 sergeyk sergeyk 32M Nov 14 12:21 wikipaintings_oct2013.csv
-rw-r--r-- 1 sergeyk sergeyk 13M Sep 26 02:46 wikipaintings_urls.csv
-rw-r--r-- 1 sergeyk sergeyk 17M Sep 26 02:44 wikipaintings_urls.h5
-rw-r--r-- 1 sergeyk sergeyk 15M Sep 26 02:46 wikipaintings_urls.pickle
In [7]:
df = pd.read_hdf('../data/shared/pins_df_80k_mar2014.h5', 'df')
In [8]:
df.columns
Out[8]:
Index([u'style_Detailed', u'style_Pastel', u'style_Melancholy', u'style_Noir', u'style_HDR', u'style_Vintage', u'style_Long_Exposure', u'style_Horror', u'style_Sunny', u'style_Bright', u'style_Hazy', u'style_Bokeh', u'style_Serene', u'style_Texture', u'style_Ethereal', u'style_Macro', u'style_Depth_of_Field', u'style_Geometric_Composition', u'style_Minimal', u'style_Romantic', u'image_url', u'page_url'], dtype='object')
In [9]:
df_ = pd.read_pickle('/Users/karayev/temp.pickle')
Out[9]:
caption
222787512789065621
Gold sequinned bags for confetti toss. Lady Le...
278097345711613547
Staircase at the Bristol Palace Hotel, Genoa, ...
146648531589410451
I+Love+You+Blogs+and+Tea+print+by+jenniferramo...
221520875391577171
Sigmar Polke - Grossmünster Cathedral, Zürich.
95701560802199670
snake leather studs
409616528579827127
Place Vendôme, Paris I
103442122664244081
.
75927943691655222
black and gold
103442122661856360
.those doors!
222787512790891996
Arrowood Photography San Francisco Wedding Pho...
216665432044224432
lorenzo castillo
229402174739636545
pink pout and cute bangs
27232772717846662
orange and pink
263390278181111712
Virgil Finlay’s Pen & Ink Drawings - mashKULTURE
98164466852428985
322359285799114607
204069426839050013
walter groupius door handles // #brass #hardware
274438171016534450
Poppy2 by Donovan Beeson, via Flickr
112027109452018431
Traditional wedding invite on ecru paper mount...
103442122662205909
...
113575221824766586
wood table with brass base
95420085828451130
The perfect detail in this exuberant silhouett...
558094578792699356
Sinus Trestles// L
213639576042867699
! Frost Chocolate Cupcakes with a dollop of h...
188517934375335002
Curtain Door by Matharoo Associates
62909726019056846
// mirror
140806212582670
Pathway Stones. I think I can, I think I can!
363313894909823093
making you own dress pins
74802043782676335
VT Interiors - Library of Inspirational Images...
98938523033681564
---
146648531590527065
260505159669845899
Nice idea for wall numbers. - Metal and Wood?
176977460330741825
McQueen.
102949541453526203
gorgeous globe
349099408585028963
Boots
54043264252713171
Blackened Steel Fireplace from Aguirre Design Inc
442971313320385485
beautiful bohemian bracelet
138345019777658447
Tatting: Bead and Picot Bracelet. Inspiration ...
275564070921310826
Flos
58828338856896493
Sole Society
260505159667449672
Arched pocket doors. #design #interiors
382031980862417180
Burgundy trench.
273875221063062356
botanical branches
565975878143076644
amazingly said
239042692691180167
The Plumed Serpent Bridal
205476801718294880
sit
136796907404834658
Küçük Su Kasrı, İstanbul door
74802043780424913
pink couch.
99501472990454660
102949541453474338
masking tape
480337116478414754
tile
183662491025910067
Marie Antoinette's actual shoe collection
221943087860425247
.
81557443224899487
Nightstand style
344173596494100605
Dior
135952482472695183
program
192599321535737824
ஓPink ஓ
33425222206901632
pair of chic 1950's style wingback chairs
116319602849085314
Viktor & Rolf s/s 2011
430304939367767173
Versace Art Deco
...
80000 rows × 1 columns
In [13]:
df = df.join(df_)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-13-67a3b8217659> in <module>()
----> 1 df = df.join(df_)
2 df
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in join(self, other, on, how, lsuffix, rsuffix, sort)
3578 # For SparseDataFrame's benefit
3579 return self._join_compat(other, on=on, how=how, lsuffix=lsuffix,
-> 3580 rsuffix=rsuffix, sort=sort)
3581
3582 def _join_compat(self, other, on=None, how='left', lsuffix='', rsuffix='',
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/frame.pyc in _join_compat(self, other, on, how, lsuffix, rsuffix, sort)
3592 return merge(self, other, left_on=on, how=how,
3593 left_index=on is None, right_index=True,
-> 3594 suffixes=(lsuffix, rsuffix), sort=sort)
3595 else:
3596 if on is not None:
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/tools/merge.pyc in merge(left, right, how, on, left_on, right_on, left_index, right_index, sort, suffixes, copy)
38 right_index=right_index, sort=sort, suffixes=suffixes,
39 copy=copy)
---> 40 return op.get_result()
41 if __debug__:
42 merge.__doc__ = _merge_doc % '\nleft : DataFrame'
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/tools/merge.pyc in get_result(self)
187
188 # this is a bit kludgy
--> 189 ldata, rdata = self._get_merge_data()
190
191 # TODO: more efficiently handle group keys to avoid extra
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/tools/merge.pyc in _get_merge_data(self)
282 lsuf, rsuf = self.suffixes
283 ldata, rdata = ldata._maybe_rename_join(rdata, lsuf, rsuf,
--> 284 copydata=False)
285 return ldata, rdata
286
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/core/internals.pyc in _maybe_rename_join(self, other, lsuffix, rsuffix, copydata)
3441 if not lsuffix and not rsuffix:
3442 raise ValueError('columns overlap but no suffix specified: %s'
-> 3443 % to_rename)
3444
3445 def lrenamer(x):
ValueError: columns overlap but no suffix specified: Index([u'caption'], dtype='object')
In [16]:
df.to_hdf('../data/shared/pins_df_80k_mar2014.h5', 'df', mode='w', complib='blosc', complevel=9)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:1251: DeprecationWarning: createGroup() is pending deprecation, use create_group() instead. You may use the pt2to3 tool to update your source code.
group = self._handle.createGroup(path, p)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2446: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->axis0] [items->None]
warnings.warn(ws, PerformanceWarning)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2449: DeprecationWarning: createVLArray() is pending deprecation, use create_vlarray() instead. You may use the pt2to3 tool to update your source code.
_tables().ObjectAtom())
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2446: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->unicode,key->axis1] [items->None]
warnings.warn(ws, PerformanceWarning)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2449: DeprecationWarning: createVLArray() is pending deprecation, use create_vlarray() instead. You may use the pt2to3 tool to update your source code.
_tables().ObjectAtom())
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2422: DeprecationWarning: createCArray() is pending deprecation, use create_carray() instead. You may use the pt2to3 tool to update your source code.
filters=self._filters)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2422: DeprecationWarning: createCArray() is pending deprecation, use create_carray() instead. You may use the pt2to3 tool to update your source code.
filters=self._filters)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2446: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->['image_url', 'page_url', u'caption']]
warnings.warn(ws, PerformanceWarning)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2449: DeprecationWarning: createVLArray() is pending deprecation, use create_vlarray() instead. You may use the pt2to3 tool to update your source code.
_tables().ObjectAtom())
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2446: PerformanceWarning:
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_items] [items->None]
warnings.warn(ws, PerformanceWarning)
/usr/local/Cellar/python/2.7.6/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/pandas/io/pytables.py:2449: DeprecationWarning: createVLArray() is pending deprecation, use create_vlarray() instead. You may use the pt2to3 tool to update your source code.
_tables().ObjectAtom())
Content source: Jai-Chaudhary/vislab
Similar notebooks: