In [120]:
import pandas as pd
import numpy as np
import os
from urllib import urlretrieve
import time
import glob

In [3]:
os.chdir('/Users/Walkon302/Desktop/deep-learning-models-master/view2buy')

In [10]:
# load the image url file
url = pd.read_csv('eval_img_url.csv', header=None)

In [15]:
# Check the url file
url.head()


Out[15]:
spu url
0 100000000317431808 http://a.vpimg2.com/upload/merchandise/pdc/808...
1 100000001066491904 http://a.vpimg2.com/upload/merchandise/pdc/904...
2 100000005622976512 http://a.vpimg2.com/upload/merchandise/pdc/512...
3 100000009136676865 http://a.vpimg2.com/upload/merchandise/pdcvis/...
4 100000009494401198 http://a.vpimg2.com/upload/merchandise/pdcvis/...

In [7]:
# Load the filtered view2buy file containing around 300k files.
df = pd.read_pickle('user_fea_for_eval.pkl')

In [29]:
# Filtered view2buy data againg and select the items that we have url for.
df_url_count = df[(df['buy_spu'].isin(url['spu'])) & (df['view_spu'].isin(url['spu']))].groupby('user_id').count()

In [54]:
df_url = df[(df['buy_spu'].isin(url['spu'])) & (df['view_spu'].isin(url['spu']))]

In [53]:
df[(df['buy_spu'].isin(url['spu'])) & (df['view_spu'].isin(url['spu'])) & (df['buy_spu'] == df['view_spu'])].shape


Out[53]:
(30203, 13)

In [39]:
# Reset the index
df_url_count = df_url_count.reset_index()

In [44]:
# Save the id that contains more than 20 browing history. for frther examination. We get around 
user_id = df_url_count[df_url_count['view_spu'] > 20]['user_id']

In [70]:
# Select data with selected parameters
df_url = df_url[(df_url['user_id'].isin(user_id))]

In [73]:
df_url2 = pd.merge(df_url, url, left_on='view_spu', right_on='spu')

In [79]:
df_url2.head()


Out[79]:
0 user_id buy_spu buy_sn buy_ct3 view_spu view_sn view_ct3 time_interval view_cnt view_secondes view_features buy_features spu url
0 4209887493\t453532580309307392\t10004616\t334\... 4209887493 453532580309307392 10004616 334 14150170026959126 10010102 334 21114 1 11 [0.135, 1.078, 0.06, 0.241, 0.213, 0.22, 0.039... [0.1, 1.804, 0.049, 0.883, 0.092, 0.053, 0.042... 14150170026959126 http://a.vpimg2.com/upload/merchandise/pdcvis/...
1 529805243\t103096245561765919\t10010102\t334\t... 529805243 103096245561765919 10010102 334 14150170026959126 10010102 334 37794 4 66 [0.135, 1.078, 0.06, 0.241, 0.213, 0.22, 0.039... [0.467, 0.385, 0.0, 0.043, 0.292, 0.0, 0.448, ... 14150170026959126 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2 3748045464\t446777176556679168\t10005711\t334\... 3748045464 446777176556679168 10005711 334 14150170026959126 10010102 334 18820 1 34 [0.135, 1.078, 0.06, 0.241, 0.213, 0.22, 0.039... [0.018, 0.161, 0.088, 0.141, 0.231, 0.0, 0.036... 14150170026959126 http://a.vpimg2.com/upload/merchandise/pdcvis/...
3 4209887493\t438895881520357521\t10004616\t334\... 4209887493 438895881520357521 10004616 334 14150170026959126 10010102 334 13978 1 11 [0.135, 1.078, 0.06, 0.241, 0.213, 0.22, 0.039... [0.036, 0.439, 0.0, 0.074, 0.194, 0.0, 0.331, ... 14150170026959126 http://a.vpimg2.com/upload/merchandise/pdcvis/...
4 4209887493\t74104320184119307\t10004616\t334\t... 4209887493 74104320184119307 10004616 334 14150170026959126 10010102 334 14313 1 11 [0.135, 1.078, 0.06, 0.241, 0.213, 0.22, 0.039... [0.078, 2.304, 0.132, 0.191, 0.0, 0.087, 0.341... 14150170026959126 http://a.vpimg2.com/upload/merchandise/pdcvis/...

In [76]:
#df_url2.to_pickle('view2buy_url.pkl')

In [110]:
view_url = pd.DataFrame(df_url2.groupby(['view_spu', 'url']).count().reset_index()[['view_spu', 'url']])

In [111]:
view_url


Out[111]:
view_spu url
0 357872333107204 http://a.vpimg2.com/upload/merchandise/pdc/204...
1 357875526680651 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2 357882254983171 http://a.vpimg2.com/upload/merchandise/pdcvis/...
3 357901107539985 http://a.vpimg2.com/upload/merchandise/pdcvis/...
4 639360131194904 http://a.vpimg2.com/upload/merchandise/pdcvis/...
5 639369692328147 http://a.vpimg2.com/upload/merchandise/pdcvis/...
6 639371126526005 http://a.vpimg2.com/upload/merchandise/pdcvis/...
7 639371126526005 http://a.vpimg2.com/upload/merchandise/pdcvis/...
8 639389503180805 http://a.vpimg2.com/upload/merchandise/pdcvis/...
9 639389503180805 http://a.vpimg2.com/upload/merchandise/pdcvis/...
10 639392717246493 http://a.vpimg2.com/upload/merchandise/pdcvis/...
11 920816362999808 http://a.vpimg2.com/upload/merchandise/pdcvis/...
12 920832942268416 http://a.vpimg2.com/upload/merchandise/pdcvis/...
13 920846404657158 http://a.vpimg2.com/upload/merchandise/pdcvis/...
14 1202299096490170 http://a.vpimg2.com/upload/merchandise/pdcvis/...
15 1202310084616220 http://a.vpimg2.com/upload/merchandise/pdcvis/...
16 1202315469205505 http://a.vpimg2.com/upload/merchandise/pdcvis/...
17 1202326037671948 http://a.vpimg2.com/upload/merchandise/pdcvis/...
18 1202342670667978 http://a.vpimg2.com/upload/merchandise/pdcvis/...
19 1202342670671910 http://a.vpimg2.com/upload/merchandise/pdcvis/...
20 1483782161825904 http://a.vpimg2.com/upload/merchandise/pdcvis/...
21 1765254792024080 http://a.vpimg2.com/upload/merchandise/pdcvis/...
22 2046724026622135 http://a.vpimg2.com/upload/merchandise/pdcvis/...
23 2046767967338535 http://a.vpimg2.com/upload/merchandise/pdcvis/...
24 2046769978417582 http://a.vpimg2.com/upload/merchandise/pdcvis/...
25 2609679722156042 http://a.vpimg2.com/upload/merchandise/pdcvis/...
26 2891145450029056 http://a.vpimg2.com/upload/merchandise/pdcvis/...
27 2891152447287296 http://a.vpimg2.com/upload/merchandise/pdcvis/...
28 2891160600990330 http://a.vpimg2.com/upload/merchandise/pdcvis/...
29 2891169552208298 http://a.vpimg2.com/upload/merchandise/pdcvis/...
... ... ...
2567 9014593909249236993 http://a.vpimg2.com/upload/merchandise/pdc/993...
2568 9015719808079347717 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2569 9016845703713148936 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2570 9017127181215457486 http://a.vpimg2.com/upload/merchandise/pdc/486...
2571 9017971603620016137 http://a.vpimg2.com/upload/merchandise/pdc/137...
2572 9017971606044979200 http://a.vpimg2.com/upload/merchandise/pdc/200...
2573 9018816028550127616 http://a.vpimg2.com/upload/merchandise/pdc/616...
2574 9022193728259383296 http://a.vpimg2.com/upload/merchandise/pdc/296...
2575 9022475203247398916 http://a.vpimg2.com/upload/merchandise/pdc/916...
2576 9022756682497105932 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2577 9023882578131013643 http://a.vpimg2.com/upload/merchandise/pdc/643...
2578 9024727008410832903 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2579 9026415852921315334 http://a.vpimg2.com/upload/merchandise/pdc/334...
2580 9026697332171030534 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2581 9027823227804868619 http://a.vpimg2.com/upload/merchandise/pdc/619...
2582 9031763960910209025 http://a.vpimg2.com/upload/merchandise/pdc/025...
2583 9033171252362444801 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2584 9033734206588821512 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2585 9035141577199349769 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2586 9036830509787594752 http://a.vpimg2.com/upload/merchandise/pdc/752...
2587 9037674851989819400 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2588 9038237806216196097 http://a.vpimg2.com/upload/merchandise/pdc/097...
2589 9039645182176481286 http://a.vpimg2.com/upload/merchandise/pdc/286...
2590 9039926651803541506 http://a.vpimg2.com/upload/merchandise/pdc/506...
2591 9041615583843246080 http://a.vpimg2.com/upload/merchandise/pdc/080...
2592 9088621908743286785 http://a.vpimg2.com/upload/merchandise/pdc/785...
2593 9089747807251402752 http://a.vpimg2.com/upload/merchandise/pdc/752...
2594 9090029283626840066 http://a.vpimg2.com/upload/merchandise/pdc/066...
2595 9090592232181542912 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2596 9094251405871296512 http://a.vpimg2.com/upload/merchandise/pdc/512...

2597 rows × 2 columns


In [114]:
for index, row in view_url.iterrows():                                      
    try:
        urlretrieve(row['url'], 'view_data_image/{}.jpg'.format(row['view_spu']))
        time.sleep(10)
    except Exception as e:
        print 'index:{}, filename: {}, url: {}'.format(index, row['view_spu'], row['url'])
        print 'exception type: {}, args: {}, exception: {}'.format(type(e), e.args, e)


index:2, filename: 357882254983171, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/09/19/38/4dd42d3c-ac7f-47f4-9e00-787e63dce01e.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 4380 out of 60704 bytes',), exception: retrieval incomplete: got only 4380 out of 60704 bytes
index:13, filename: 920846404657158, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/29/68/a40ef312-845c-41d5-a852-8401d16c1a59.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:15, filename: 1202310084616220, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/07/130/9a5a7f60-14f6-42d3-8ba0-302a0c4e6dc9.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 0 out of 50167 bytes',), exception: retrieval incomplete: got only 0 out of 50167 bytes
index:22, filename: 2046724026622135, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/08/27/35/8b7971f0-ad0b-466d-aa32-7902f85189fb.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 1460 out of 25366 bytes',), exception: retrieval incomplete: got only 1460 out of 25366 bytes
index:27, filename: 2891152447287296, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/09/06/13/bbe7d4f9-1758-4bb6-9527-26813985d859.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 40444 out of 58193 bytes',), exception: retrieval incomplete: got only 40444 out of 58193 bytes
index:216, filename: 24001791427289098, url: http://a.vpimg2.com/upload/merchandise/pdc/098/289/24001791427289098/1/1YY3035460050-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 12572 out of 33562 bytes',), exception: retrieval incomplete: got only 12572 out of 33562 bytes
index:320, filename: 34134875562242082, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/09/08/136/7883e8f6-2b89-4303-b0cb-4094f3c12416.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 13768 out of 71537 bytes',), exception: retrieval incomplete: got only 13768 out of 71537 bytes
index:329, filename: 34697864820551702, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/12/30/17/20ca2d85-f7d3-4b72-ad5b-df7cce6d052e.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 4096 out of 56464 bytes',), exception: retrieval incomplete: got only 4096 out of 56464 bytes
index:341, filename: 72415513913565187, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/14/153/4f8105f6-6838-41a6-8982-36887e6cb9fe.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 4942 out of 31535 bytes',), exception: retrieval incomplete: got only 4942 out of 31535 bytes
index:514, filename: 89304008517624104, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/12/23/175/9dadc353-0dfb-4a71-9ca3-d08f0c949707.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:601, filename: 97185265860571162, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/08/26/64/8c23e513-ce72-499d-bc62-218d2bc78304.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 4380 out of 25253 bytes',), exception: retrieval incomplete: got only 4380 out of 25253 bytes
index:770, filename: 294217768972480514, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/25/109/2225d9da-eef5-4014-b9e3-df23bf0a4fc8.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 8760 out of 72331 bytes',), exception: retrieval incomplete: got only 8760 out of 72331 bytes
index:829, filename: 298439914808242231, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/19/65/098476fc-706b-4882-bd41-df02b2dbe844.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:1046, filename: 316735778810167299, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/23/103/0c26dc30-5b38-4893-be67-4d747e9f5208.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:1072, filename: 319550517313740802, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/23/5/51fff803-0eaf-432c-8f2d-188f11a4d7ec.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 0 out of 14962 bytes',), exception: retrieval incomplete: got only 0 out of 14962 bytes
index:1170, filename: 438614409363570705, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/09/01/91/c87ecf96-b1fb-44c4-83ac-2693d546d578.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 7016 out of 41875 bytes',), exception: retrieval incomplete: got only 7016 out of 41875 bytes
index:1172, filename: 438614435236745234, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/01/136/ba7c3051-469d-43fd-91f0-41023fa1caf1.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 2920 out of 46864 bytes',), exception: retrieval incomplete: got only 2920 out of 46864 bytes
index:1179, filename: 439458834293702666, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/09/14/118/91645600-82b6-4790-b56d-ed1074915229.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 25448 out of 37777 bytes',), exception: retrieval incomplete: got only 25448 out of 37777 bytes
index:1182, filename: 439458877074387002, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/12/14/63/4c25d43d-d240-47bd-8467-1326bfea3959.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 13768 out of 45119 bytes',), exception: retrieval incomplete: got only 13768 out of 45119 bytes
index:1420, filename: 457191792472825882, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/12/05/19/693e7396-b294-463a-acc2-01de3498c72b.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 29200 out of 39912 bytes',), exception: retrieval incomplete: got only 29200 out of 39912 bytes
index:1467, filename: 461413925257830545, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/24/94/bb9d4d9c-a99a-492a-80ea-444288151669.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:1468, filename: 461695358101766158, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/08/23/33/5e31957b-7d46-4c4f-93ac-ef8c76cbc6aa.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:1583, filename: 963565080649408512, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/25/6/597de7ee-205a-47bf-9622-90806598dee5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 4380 out of 16794 bytes',), exception: retrieval incomplete: got only 4380 out of 16794 bytes
index:1619, filename: 1082629013079781376, url: http://a.vpimg2.com/upload/merchandise/pdc/376/781/1082629013079781376/1/1051190-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 0 out of 19793 bytes',), exception: retrieval incomplete: got only 0 out of 19793 bytes
index:1722, filename: 2191359016057393152, url: http://a.vpimg2.com/upload/merchandise/pdc/152/393/2191359016057393152/0/1682430-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 7300 out of 22591 bytes',), exception: retrieval incomplete: got only 7300 out of 22591 bytes
index:1850, filename: 2826929451589525504, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/09/138/57b8d95c-2270-4efc-af22-44adda57040d_t.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 0 out of 98835 bytes',), exception: retrieval incomplete: got only 0 out of 98835 bytes
index:1852, filename: 2827492402281959424, url: http://a.vpimg2.com/upload/merchandise/pdc/424/959/2827492402281959424/0/1060546-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 0 out of 26327 bytes',), exception: retrieval incomplete: got only 0 out of 26327 bytes
index:1862, filename: 2832840426724958208, url: http://a.vpimg2.com/upload/merchandise/pdc/208/958/2832840426724958208/0/fd38ee57-7a10-4230-a078-cc3433e93abf.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 0 out of 278 bytes',), exception: retrieval incomplete: got only 0 out of 278 bytes
index:1875, filename: 2836781075773157376, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/10/157/cf97fe2b-1b9d-448d-8cfe-a2b4c72a5e25.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 2920 out of 25230 bytes',), exception: retrieval incomplete: got only 2920 out of 25230 bytes
index:1887, filename: 2845506800768520192, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/09/7/6c5695d8-5188-4efa-9bf6-0d2a495c239d_t.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 41508 out of 96337 bytes',), exception: retrieval incomplete: got only 41508 out of 96337 bytes
index:1997, filename: 3822225051725512705, url: http://a.vpimg2.com/upload/merchandise/pdc/705/512/3822225051725512705/0/1901500-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 14600 out of 18553 bytes',), exception: retrieval incomplete: got only 14600 out of 18553 bytes
index:2048, filename: 4063730591896084758, url: http://a.vpimg2.com/upload/merchandise/pdc/758/084/4063730591896084758/3/1759647-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 7016 out of 16669 bytes',), exception: retrieval incomplete: got only 7016 out of 16669 bytes
index:2191, filename: 4428803637838594048, url: http://a.vpimg2.com/upload/merchandise/pdc/048/594/4428803637838594048/0/2123114-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 14168 out of 14506 bytes',), exception: retrieval incomplete: got only 14168 out of 14506 bytes
index:2200, filename: 4496076157272420353, url: http://a.vpimg2.com/upload/merchandise/pdc/353/420/4496076157272420353/0/2123099-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 13160 out of 17378 bytes',), exception: retrieval incomplete: got only 13160 out of 17378 bytes
index:2203, filename: 4627806448492449792, url: http://a.vpimg2.com/upload/merchandise/pdc/792/449/4627806448492449792/0/1755203-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 12626 out of 19516 bytes',), exception: retrieval incomplete: got only 12626 out of 19516 bytes
index:2209, filename: 4706619441972056064, url: http://a.vpimg2.com/upload/merchandise/pdc/064/056/4706619441972056064/1/1755209-5.jpg
exception type: <class 'urllib.ContentTooShortError'>, args: ('retrieval incomplete: got only 5556 out of 15390 bytes',), exception: retrieval incomplete: got only 5556 out of 15390 bytes
index:2300, filename: 6586027870691340619, url: http://a.vpimg2.com/upload/merchandise/pdc/619/340/6586027870691340619/0/1587363-5.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:2314, filename: 6853429096767770624, url: http://a.vpimg2.com/upload/merchandise/pdc/624/770/6853429096767770624/0/1903601-5.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:2414, filename: 8462621471316647936, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/10/24/97/a8c9d912-2f59-4c20-8fe7-977e2d3f8a2e_t.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)

In [182]:
needfixed = glob.glob('NeedFixed/*.jpg')

In [183]:
a = pd.DataFrame([x[10:-4] for x in needfixed])

In [184]:
a.columns = ['spu']

In [188]:
a['spu']= a['spu'].astype(int)

In [189]:
b = pd.merge(a, view_url, left_on='spu', right_on='view_spu')

In [191]:
b.head()


Out[191]:
spu view_spu url
0 10491016902120013 10491016902120013 http://a.vpimg2.com/upload/merchandise/pdcvis/...
1 1202342670667978 1202342670667978 http://a.vpimg2.com/upload/merchandise/pdcvis/...
2 1202342670671910 1202342670671910 http://a.vpimg2.com/upload/merchandise/pdcvis/...
3 12461341739094214 12461341739094214 http://a.vpimg2.com/upload/merchandise/pdcvis/...
4 16120516436332544 16120516436332544 http://a.vpimg2.com/upload/merchandise/pdcvis/...

In [192]:
b.shape


Out[192]:
(51, 3)

In [193]:
for index, row in b.iterrows():                                      
    try:
        urlretrieve(row['url'], 'fixed_data_image/{}.jpg'.format(row['view_spu']))
        time.sleep(3)
    except Exception as e:
        print 'index:{}, filename: {}, url: {}'.format(index, row['view_spu'], row['url'])
        print 'exception type: {}, args: {}, exception: {}'.format(type(e), e.args, e)


index:29, filename: 448747540332511261, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/18/138/95c7b5ec-73f8-440a-a1f1-3086dcdc32fa.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:30, filename: 449873443790426124, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/28/45/cf305005-cd37-4ab1-9c94-f62141686804.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:31, filename: 450154915216064523, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/18/156/fb3eb7cb-da23-4011-9976-1dcf46d86e7d.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)
index:32, filename: 451843765076328474, url: http://a.vpimg2.com/upload/merchandise/pdcvis/2016/11/18/140/3db1a419-59c3-4a44-8deb-04f066383ce2.jpg
exception type: <type 'exceptions.IOError'>, args: ('http protocol error', 0, 'got a bad status line', None), exception: ('http protocol error', 0, 'got a bad status line', None)

In [ ]: