Create video database

In this notebook we create our video database from a list of search terms.

The search terms must be written in the "searches.txt" file, one search per line. There is no limit on the number of searches.

The number of videos retrieved for each search line is set by the `video_per_search` variable in the configuration cell below (150 here; it must be a multiple of 50, up to 500).


In [1]:
# `os` is needed here to read the environment; the notebook's main import list
# comes later in this cell.
import os

# Maximum number of IDs sent in a single videos / channels request. The request
# helpers below split longer ID lists into batches of this size.
VIDEOS_REQUEST_ID_LIMIT = 50
CHANNEL_REQUEST_ID_LIMIT = 50

# API keys. The "KEY" placeholders are kept as a fallback, but a real key should
# be supplied through the YOUTUBE_API_KEY environment variable so that it is
# never saved inside the notebook.
key1 = "KEY"
key2 = "KEY"

DEVELOPER_KEY = os.environ.get("YOUTUBE_API_KEY", key2)

import requests
import json
import pandas as pd
from math import *
import numpy as np
import tensorflow as tf
import time
import collections
import os
import timeit

from IPython.display import display

In [ ]:
#where the database will be stored (relative to the notebook)
folder = os.path.join('sql_database')

#make sure the folder exists, otherwise the first write of a sqlite file fails
os.makedirs(folder, exist_ok=True)

#how many videos per search we want to get (max=500, multiple of 50)
video_per_search = 150

Functions

The functions used to get the channel and video information.


In [3]:
#----------------------------------------------------------------------------------------
# VIDEO REQUEST
#----------------------------------------------------------------------------------------
def videos_request(id_list,db):
    """Fetch snippet and statistics of videos and add them to a dataframe.

    The IDs are sent to the YouTube Data API ``videos`` endpoint in batches of
    ``VIDEOS_REQUEST_ID_LIMIT``, authenticated with ``DEVELOPER_KEY``.

    Parameters
    ----------
    id_list : list of str
        IDs of the videos we want to get the information from.
    db : pandas.DataFrame
        The video dataframe the new rows are added to. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``db`` followed by one row per video returned by the API, with the
        columns ``id``, ``channelId``, ``title``, ``thumbnailUrl``,
        ``viewCount``, ``likeCount``, ``dislikeCount``, ``commentCount`` and
        ``subsCount``. Missing like/dislike/comment counts are stored as
        ``'0'``; ``subsCount`` is always the placeholder ``'na'``.
    """
    nbr_videos = len(id_list)

    #number of requests needed (integer ceiling of nbr_videos / limit)
    nbr_requests = -(-nbr_videos // VIDEOS_REQUEST_ID_LIMIT)

    print('videos request(): number of videos:' , nbr_videos)
    print('videos request(): number of requests:' , nbr_requests)

    #rows are collected here and added to the dataframe in one go at the end
    records = []

    for i in range(nbr_requests):
        print('videos request(): request:' , i+1, '/',nbr_requests)

        #comma-separated IDs of the current batch
        batch = id_list[i*VIDEOS_REQUEST_ID_LIMIT:(i+1)*VIDEOS_REQUEST_ID_LIMIT]
        req_id = ','.join(batch)

        #we get "snippet" and "statistics"
        url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id={}&key={}'.format(req_id, DEVELOPER_KEY)

        r = requests.get(url)
        data = json.loads(r.text)

        #if we don't have any error we should have items
        if 'items' not in data:
            print('videos request(): error: no items in data')
            continue

        for item in data['items']:
            snippet = item['snippet']
            statistics = item['statistics']
            records.append({
                'id':           item['id'],
                'channelId':    snippet['channelId'],
                'title':        snippet['title'],
                'thumbnailUrl': snippet['thumbnails']['default']['url'],
                'viewCount':    statistics['viewCount'],
                #these counters are sometimes not sent, default to '0'
                'likeCount':    statistics.get('likeCount', '0'),
                'dislikeCount': statistics.get('dislikeCount', '0'),
                'commentCount': statistics.get('commentCount', '0'),
                #filled in later from the channel information
                'subsCount':    'na',
            })

    if not records:
        return db

    #return the new dataframe
    return pd.concat([db, pd.DataFrame(records)], ignore_index=True)

#----------------------------------------------------------------------------------------
# VIDEO ID LIST FROM CHANNEL ID
#----------------------------------------------------------------------------------------
def videos_id_list_from_channel_id(id_channel):
    """Get the IDs of the videos of a list of channels.

    For every channel the YouTube Data API ``search`` endpoint is queried for
    videos published before 2016-07-07, 20 results per page, following
    ``nextPageToken`` until the results end or the number of pages announced by
    ``pageInfo.totalResults`` is reached.

    Parameters
    ----------
    id_channel : list of str
        Channel IDs.

    Returns
    -------
    list of str
        Video IDs of all channels, in the order they were received. A channel
        whose first response has no ``pageInfo`` is reported and skipped.
    """
    search_url = 'https://www.googleapis.com/youtube/v3/search'
    page_size = 20

    #prepare the videos list to return
    vid_list = []

    print('videos_id_list_from_channel_id(): number of channels:', len(id_channel))

    for n, channel_id in enumerate(id_channel):

        #the same filters are used for every page of a channel
        params = {
            'part': 'id',
            'publishedBefore': '2016-07-07T00:00:00Z',
            'maxResults': page_size,
            'type': 'video',
            'channelId': channel_id,
            'key': DEVELOPER_KEY,
        }

        r = requests.get(search_url, params=params)
        data = json.loads(r.text)

        print('videos_id_list_from_channel_id(): channel:', n,'/', len(id_channel))

        #an error response has no pageInfo
        if 'pageInfo' not in data:
            print('videos_id_list_from_channel_id(): error: unexpected response:', r.text)
            continue

        totalVideo = data['pageInfo']['totalResults']
        nbIter = ceil(totalVideo/page_size)

        print('videos_id_list_from_channel_id(): totalvideo=',totalVideo,'nbiter=',nbIter)

        page = 1
        while True:
            print('videos_id_list_from_channel_id(): page:', page, '/', nbIter)

            #keep the items of every page, including the last one (which has no nextPageToken)
            items = data.get('items', [])
            if len(items) < 1:
                print('ERROR: no items')
            vid_list += [item['id']['videoId'] for item in items]

            nextPage = data.get('nextPageToken')
            if nextPage is None or page >= nbIter:
                break

            r = requests.get(search_url, params=dict(params, pageToken=nextPage))
            data = json.loads(r.text)
            page += 1

    return vid_list

#----------------------------------------------------------------------------------------
# CHANNEL REQUEST
#----------------------------------------------------------------------------------------
def channel_request(id_list,db):
    """Fetch the number of subscribers of channels and add them to a dataframe.

    The IDs are sent to the YouTube Data API ``channels`` endpoint in batches of
    ``CHANNEL_REQUEST_ID_LIMIT``, authenticated with ``DEVELOPER_KEY``.

    Parameters
    ----------
    id_list : list of str
        IDs of the channels we want to get the information from.
    db : pandas.DataFrame
        The channel dataframe the new rows are added to. It is not modified.

    Returns
    -------
    pandas.DataFrame
        ``db`` followed by one row per channel returned by the API, with the
        columns ``id`` and ``subscriberCount``. A channel whose statistics
        carry no ``subscriberCount`` (presumably because the count is hidden)
        is stored with ``'0'``.
    """
    nbr_channels = len(id_list)

    #number of requests needed (integer ceiling of nbr_channels / limit)
    nbr_requests = -(-nbr_channels // CHANNEL_REQUEST_ID_LIMIT)

    print('channel_request(): number of channels:' , nbr_channels)
    print('channel_request(): number of requests:' , nbr_requests)

    #rows are collected here and added to the dataframe in one go at the end
    records = []

    for i in range(nbr_requests):
        print('channel_request(): request:' , i+1, '/',nbr_requests)

        #comma-separated IDs of the current batch
        batch = id_list[i*CHANNEL_REQUEST_ID_LIMIT:(i+1)*CHANNEL_REQUEST_ID_LIMIT]
        req_id = ','.join(batch)

        url = 'https://www.googleapis.com/youtube/v3/channels?part=statistics&id={}&key={}'.format(req_id, DEVELOPER_KEY)

        r = requests.get(url)
        data = json.loads(r.text)

        #an error response, or a batch where no channel was found, has no items
        if 'items' not in data:
            print('channel_request(): error: no items in data')
            continue

        for item in data['items']:
            records.append({
                'id': item['id'],
                'subscriberCount': item['statistics'].get('subscriberCount', '0'),
            })

    if not records:
        return db

    return pd.concat([db, pd.DataFrame(records)], ignore_index=True)

#----------------------------------------------------------------------------------------
# SEARCH VIDEOS
#----------------------------------------------------------------------------------------
def search_videos(q,limit=500):
    """Return a list of video IDs based on a list of search terms.

    Every term is sent to the YouTube Data API ``search`` endpoint (ordered by
    relevance, ``relevanceLanguage=FR``, 50 results per page). Pages are
    followed through ``nextPageToken`` until ``limit`` is covered or the
    results end.

    Parameters
    ----------
    q : list of str
        Search terms. They are passed as request parameters, so spaces and
        characters such as ``&`` or ``#`` are encoded correctly.
    limit : int, optional
        Number of videos to retrieve for each search term.

    Returns
    -------
    list of str
        Video IDs of all searches, at most ``limit`` per term, duplicates
        across terms included.
    """
    search_url = 'https://www.googleapis.com/youtube/v3/search'
    page_size = 50

    #number of pages needed to cover `limit` results
    nbIter = ceil(limit/page_size)

    vid_list = []
    print('search_videos(): number of searches:',len(q))

    for n, term in enumerate(q):

        print('search_videos(): request: ',n,'/',len(q))

        params = {
            'part': 'id',
            'order': 'relevance',
            'relevanceLanguage': 'FR',
            'safeSearch': 'none',
            'maxResults': page_size,
            'type': 'video',
            'q': term,
            'key': DEVELOPER_KEY,
        }

        term_ids = []
        for page in range(1, nbIter+1):
            print('search_videos(): page:', page, '/', nbIter)

            r = requests.get(search_url, params=params)
            data = json.loads(r.text)

            #an error response has no items: show it and give up on this term
            if 'items' not in data:
                print(r.text)
                break

            if len(data['items']) < 1:
                print('ERROR: no items')

            #keep the items of every page, including the last one (which has no nextPageToken)
            term_ids += [item['id']['videoId'] for item in data['items']]

            #no token means there is no further page to ask for
            if 'nextPageToken' not in data:
                break
            params['pageToken'] = data['nextPageToken']

        vid_list += term_ids[:limit]

    return vid_list

Database creation

Warning! Run the cells of this section only if you want to create a new SQL database. Do not run them if you want to add information to an existing dataset: they replace the existing tables with empty ones.

Create a new empty dataset for: VIDEOS ID LIST


In [4]:
# Path of the sqlite file holding the list of video IDs.
filename = os.path.join(folder, 'videos_list.sqlite')

# An empty frame with only the 'id' column overwrites any existing table.
videos_list_database = pd.DataFrame(columns=['id'])
videos_list_database.to_sql(name='videos_list', con='sqlite:///' + filename, if_exists='replace')

Create a new empty dataset for: VIDEOS


In [5]:
# Path of the sqlite file holding the video information.
filename = os.path.join(folder, 'videos.sqlite')

# Columns of the (empty) videos table; writing it overwrites any existing table.
videos_columns = [
    'id', 'channelId', 'title', 'thumbnailUrl',
    'viewCount', 'likeCount', 'dislikeCount', 'commentCount',
    'subsCount',
]
videos_database = pd.DataFrame(columns=videos_columns)
videos_database.to_sql(name='videos', con='sqlite:///' + filename, if_exists='replace')

Get a list of videos IDs

Get a list of videos IDs based on the search terms stored in the searches.txt file.

Add the retrieved IDs to the videos_id_list database.

The duplicates are deleted.


In [6]:
start_time = timeit.default_timer()

#get the existing database
filename = os.path.join(folder, 'videos_list.sqlite')
videos_list_database = pd.read_sql('videos_list', 'sqlite:///' + filename, index_col='index')

#convert to a list of str
vid_list = videos_list_database['id'].tolist()

#get the searches requests from the file: one search per line.
#Surrounding whitespace (including '\r') is removed and blank lines are ignored,
#the file is closed as soon as it has been read.
with open('searches.txt') as searches_file:
    searches_list = [line.strip() for line in searches_file if line.strip()]

print('Number of search terms: ', len(searches_list))
print('Number of video per search: ', video_per_search)
print('Total number of videos expected: ', len(searches_list)*video_per_search)

#get a list of videos IDs based on the search terms
searched_videos_list = search_videos(searches_list, video_per_search)

#add the new list of IDs to the existing one
vid_list += searched_videos_list

print('Number of video IDs we received: ', len(searched_videos_list))
print('Number of video IDs in the database: ', len(vid_list))

#delete the duplicates, keeping the first occurrence of each ID so that the
#order of the stored rows is the same on every run
vid_list = list(collections.OrderedDict.fromkeys(vid_list))

print('Number of video IDs in the database without duplicates: ', len(vid_list))

#store the videos IDs into a dataframe
videos_list_database = pd.DataFrame(vid_list,columns=['id'])

#store the dataframe in a sqlite database
videos_list_database.to_sql('videos_list', 'sqlite:///' + filename, if_exists='replace')


stop_time = timeit.default_timer()

print('Time = ', np.ceil(stop_time - start_time), 'sec')


Number of search terms:  10
Number of video per search:  150
Total number of videos expected:  1500
search_videos(): number of videos: 10
search_videos(): request:  0 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  1 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  2 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  3 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  4 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  5 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  6 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  7 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  8 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
search_videos(): request:  9 / 10
search_videos(): page: 1/ 3
search_videos(): page: 1 / 3
search_videos(): page: 2 / 3
Number of video IDs we received:  1500
Number of video IDs in the database:  1500
Number of video IDs in the database without duplicates:  1381
Time =  12.0 sec

Get the videos informations

Get the information of the videos listed on the video_id_list_database, except the number of subscribers. We can not get the number of subscribers directly, we will get it on the next step.

Information retrieved:

  • Channel ID
  • Video title
  • Thumbnail URL (120x90)
  • View count
  • Like count
  • Dislike count
  • Comment count

In [7]:
start_time = timeit.default_timer()

#get the "videos_id_list" and "videos" database 
videos_list_database = pd.read_sql('videos_list', 'sqlite:///' + os.path.join(folder, 'videos_list.sqlite'), index_col='index')
videos_database = pd.read_sql('videos', 'sqlite:///' + os.path.join(folder, 'videos.sqlite'), index_col='index')

#create a list of videos IDs from the database
video_list = videos_list_database['id'].tolist()

print('Number of video IDs: ', len(video_list))

#delete the duplicates, keeping the first occurrence of each ID
video_list = list(collections.OrderedDict.fromkeys(video_list))

print('Number of video IDs without duplicates: ', len(video_list))

#get the informations of the videos.
#The list loaded in this cell is used, not a variable left over from an earlier
#cell, so the cell also works on a fresh kernel.
videos_database = videos_request(video_list,videos_database)

print('Number of videos in the database: ', len(videos_database))

#We delete the duplicates from the dataframe and reset the index
#NOTE(review): duplicates are detected by title, so different videos sharing a
#title are merged into one row - confirm that 'id' is not the intended key.
videos_database = videos_database.drop_duplicates('title')
videos_database = videos_database.reset_index(drop=True)

print('Number of videos in the database without duplicates: ', len(videos_database))

#store the information into the database
filename = os.path.join(folder, 'videos.sqlite')
videos_database.to_sql('videos', 'sqlite:///' + filename, if_exists='replace')

stop_time = timeit.default_timer()

print('Time = ', np.ceil(stop_time - start_time), 'sec')


Number of video IDs:  1381
Number of video IDs without duplicates:  1381
videos request(): number of videos: 1381
videos request(): number of requests: 28
videos request(): request: 1 / 28
videos request(): request: 2 / 28
videos request(): request: 3 / 28
videos request(): request: 4 / 28
videos request(): request: 5 / 28
videos request(): request: 6 / 28
videos request(): request: 7 / 28
videos request(): request: 8 / 28
videos request(): request: 9 / 28
videos request(): request: 10 / 28
videos request(): request: 11 / 28
videos request(): request: 12 / 28
videos request(): request: 13 / 28
videos request(): request: 14 / 28
videos request(): request: 15 / 28
videos request(): request: 16 / 28
videos request(): request: 17 / 28
videos request(): request: 18 / 28
videos request(): request: 19 / 28
videos request(): request: 20 / 28
videos request(): request: 21 / 28
videos request(): request: 22 / 28
videos request(): request: 23 / 28
videos request(): request: 24 / 28
videos request(): request: 25 / 28
videos request(): request: 26 / 28
videos request(): request: 27 / 28
videos request(): request: 28 / 28
Number of videos in the database:  1381
Number of videos in the database without duplicates:  1377
Time =  20.0 sec

Get the number of subscribers

Get the number of subscribers of the channel hosting the video. This step has to be done separately because we need to get the channel information to retrieve the number of subscribers.


In [8]:
start_time = timeit.default_timer()

#get the videos database (the path is built here instead of relying on a
#`filename` variable left over from an earlier cell)
filename = os.path.join(folder, 'videos.sqlite')
videos_database = pd.read_sql('videos', 'sqlite:///' + filename, index_col='index')

#create a dataframe to store the number of subs
channels_database = pd.DataFrame(columns=['id','subscriberCount'])

#create a list of channels IDs
channels_list = videos_database['channelId'].tolist()

#delete the duplicates
channels_list=list(set(channels_list))

print('Number of channels:',len(channels_list))

#get the number of subscribers for each channel
channels_database = channel_request(channels_list,channels_database)

#just to be sure we delete the duplicates
channels_database = channels_database.drop_duplicates('id')
channels_database = channels_database.reset_index(drop=True)

#subscriber count indexed by channel ID; a value that is not a number becomes NaN
subs_by_channel = pd.to_numeric(
    channels_database.set_index('id')['subscriberCount'], errors='coerce')

#look up the count of every video's channel in one pass (NaN = channel not found)
subs_count = videos_database['channelId'].map(subs_by_channel)

channelsNotFound = int(subs_count.isna().sum())

#videos whose channel was not found get a count of 0
videos_database['subsCount'] = subs_count.fillna(0).astype('int64')

print('Videos without correct subs count :',channelsNotFound)

#store the updated database
videos_database.to_sql('videos', 'sqlite:///' + filename, if_exists='replace')

stop_time = timeit.default_timer()

print('Time = ', np.ceil(stop_time - start_time), 'sec')

#display(channels_database)


Number of channels: 963
channel_request(): number of channels: 963
channel_request(): number of requests: 20
channel_request(): request: 1 / 20
channel_request(): request: 2 / 20
channel_request(): request: 3 / 20
channel_request(): request: 4 / 20
channel_request(): request: 5 / 20
channel_request(): request: 6 / 20
channel_request(): request: 7 / 20
channel_request(): request: 8 / 20
channel_request(): request: 9 / 20
channel_request(): request: 10 / 20
channel_request(): request: 11 / 20
channel_request(): request: 12 / 20
channel_request(): request: 13 / 20
channel_request(): request: 14 / 20
channel_request(): request: 15 / 20
channel_request(): request: 16 / 20
channel_request(): request: 17 / 20
channel_request(): request: 18 / 20
channel_request(): request: 19 / 20
channel_request(): request: 20 / 20
Sub counter update:  0 / 1377
Sub counter update:  100 / 1377
Sub counter update:  200 / 1377
Sub counter update:  300 / 1377
Sub counter update:  400 / 1377
Sub counter update:  500 / 1377
Sub counter update:  600 / 1377
Sub counter update:  700 / 1377
Sub counter update:  800 / 1377
Sub counter update:  900 / 1377
Sub counter update:  1000 / 1377
Sub counter update:  1100 / 1377
Sub counter update:  1200 / 1377
Sub counter update:  1300 / 1377
Videos without correct subs count : 0
Time =  25.0 sec

In [9]:
# Reload the stored videos table from its sqlite file and render it.
videos_database = pd.read_sql(
    sql='videos',
    con='sqlite:///' + os.path.join(folder, 'videos.sqlite'),
    index_col='index',
)
display(videos_database)


id channelId title thumbnailUrl viewCount likeCount dislikeCount commentCount subsCount
index
0 E6rPseeo5II UCJEClyq6bcqHu5lfH4DlPqA LE STEELWRAITH A EU UN ACCIDENT! :( https://i.ytimg.com/vi/E6rPseeo5II/default.jpg 43360 2249 33 255 412097
1 _o-fVGeTeAg UCxupBaSBeLbecd56fbxqqZg PILOTE D'HELICOPTÈRE ! - GMOD DARKRP FR #18 https://i.ytimg.com/vi/_o-fVGeTeAg/default.jpg 49454 1131 88 256 23968
2 E7xbiF7w3_Y UCCvhE7Q2nK7ZVPXXaB6sgTA Mathilde, 16 ans... pilote de rallye! https://i.ytimg.com/vi/E7xbiF7w3_Y/default.jpg 90209 184 12 42 7650
3 H3xp7AXqYcM UCOiGM7A-QDGh8Bff-zADaBw La pluie, l'eau et la voiture électrique : y a... https://i.ytimg.com/vi/H3xp7AXqYcM/default.jpg 2948 11 0 0 475
4 3xK1VAh1qAM UChVO0jyCJBg0yl7NNml256g F1 2015 | PS4 | 1 | #3 | Toro Rosso | Max Vers... https://i.ytimg.com/vi/3xK1VAh1qAM/default.jpg 18013 78 10 22 1615
5 QH48BfOf_u4 UC3GMy4mA3U0yORr31_lx3Tw JE REGARDE VOS WHEELING #3 (Crash, Bavette & D... https://i.ytimg.com/vi/QH48BfOf_u4/default.jpg 83010 4758 51 466 268955
6 BT5vDjNtg_w UCrueGrv5evUa_DYl6yLAu1g Voiture électrique 100% marocaine - MARRAKECH... https://i.ytimg.com/vi/BT5vDjNtg_w/default.jpg 8168 72 3 18 0
7 FV06fcgxYPs UCPXaDbZBqx1YDtoZ2VX4Isw Crash David Cronenberg 1996 Fr https://i.ytimg.com/vi/FV06fcgxYPs/default.jpg 802497 786 157 31 2459
8 36GmqQ2QZbM UCtS0JcoBgAIEjmifiip8IJg Why You Have To Go On A Car Rally! | Rally Kin... https://i.ytimg.com/vi/36GmqQ2QZbM/default.jpg 83362 2140 67 503 562204
9 sNgyf1DfzFI UC_ThZgHTq4tVIQMr6Gb6A1w Best of Rally 2015 - MAX ATTACK https://i.ytimg.com/vi/sNgyf1DfzFI/default.jpg 1467262 5589 169 350 3604
10 N1k9DxxhuHE UCKpRJZK14ZH7ffHtXMwPcTg Unknown Movie - (PILOTE) "The Brave" https://i.ytimg.com/vi/N1k9DxxhuHE/default.jpg 37288 1135 14 99 175591
11 iIQZmeEZfVI UCBscj5RB4DddePZcfajXQEg ✔ Dessins animé voiture. Voiture de police pou... https://i.ytimg.com/vi/iIQZmeEZfVI/default.jpg 210517 93 71 10 9643
12 wveDtHWxvN8 UCcSs5zORnEPT6q7Qsw_a8DA Circuit para baile sorpresa https://i.ytimg.com/vi/wveDtHWxvN8/default.jpg 232075 1192 49 53 284
13 yqQwH_Qj1x0 UCwLhmyAenL3yfWPYi9yUQog This is Rally 4 | The best scenes of Rallying ... https://i.ytimg.com/vi/yqQwH_Qj1x0/default.jpg 10172558 35914 1681 2719 38443
14 _RtqtrDl4U0 UC1MMoNpq6tF4skdFs4uhFng DEVENIR PILOTE DE CHASSE https://i.ytimg.com/vi/_RtqtrDl4U0/default.jpg 95526 213 21 22 3582
15 bkkzRXTH1OU UCoj6RxIAQq8kmJme-5dnN0Q LED Light Controller Short-circuit (can burn d... https://i.ytimg.com/vi/bkkzRXTH1OU/default.jpg 281 28 1 9 3402
16 ZlML_3s9QvA UC6TLCeiUSRa5PYM3AlYhkKg Conduire une voiture - Comment démarrer ? https://i.ytimg.com/vi/ZlML_3s9QvA/default.jpg 810126 9028 552 420 0
17 kgbrGWDmVtE UCH-5RHGmmZpLsTi-BBkG7Vw Ford Fiesta WRC (SL Rally Evo) https://i.ytimg.com/vi/kgbrGWDmVtE/default.jpg 5443 127 12 43 4389
18 8gj48WJu5Vo UCBN6kZNMvSj-7f57MLb-fNA JE CREE UNE COURSE TROLL RAGEANTE - GTA 5 ONLINE https://i.ytimg.com/vi/8gj48WJu5Vo/default.jpg 5312 567 25 131 205488
19 QRuNmUYcPio UCFEMbt9ck3r4T8rzyehsOFA Course de côte Moto, Quad et Karting, Le CARBE... https://i.ytimg.com/vi/QRuNmUYcPio/default.jpg 85685 258 27 13 852
20 XdiNwxR4a84 UCTlQW7r2D2S-4c3V-wlDunQ RALLY EN LA NIEVE!! https://i.ytimg.com/vi/XdiNwxR4a84/default.jpg 38934 2066 75 281 748908
21 iGtbyP579sQ UCP6sXGdUAy6NFo8V5zu1wqg La voiture électrique autonome de Diffuselec https://i.ytimg.com/vi/iGtbyP579sQ/default.jpg 1577 3 0 0 143
22 nd0MPMNa4IE UCOM2tOkZmjgRaD7BAQyanjQ Le métier de pilote de ligne par stéphan https://i.ytimg.com/vi/nd0MPMNa4IE/default.jpg 90082 164 3 27 1731
23 ie30-nTcyOA UCUphEIU9QlW78XsAdU8Lc8g Shocking Moment a Motorcycle Riders Crashes & ... https://i.ytimg.com/vi/ie30-nTcyOA/default.jpg 8894632 24494 5585 2922 83826
24 RE7BDpIdM08 UC5YlF1SfXpeEld2vkjK5-sQ GTA ONLINE - Course de la Mort https://i.ytimg.com/vi/RE7BDpIdM08/default.jpg 594723 6823 443 756 633239
25 2ppNvfZnCRw UCe2_2k-_81Y4RNKBnlMjrLg Métier de la Mer - Portrait d'un pilote de port https://i.ytimg.com/vi/2ppNvfZnCRw/default.jpg 13155 15 2 1 5439
26 o02s_g5AUUE UCB_qr75-ydFVKSF9Dmo6izg Your Favourite Belgian Grand Prix - 1998 Chaos... https://i.ytimg.com/vi/o02s_g5AUUE/default.jpg 1398394 7660 94 726 271628
27 JXA6H_L8KuM UC6vlFQPsP4j9dBkKFryqhgw MA FAMILLE A EU UN ACCIDENT... https://i.ytimg.com/vi/JXA6H_L8KuM/default.jpg 103919 7097 713 2623 253209
28 bYIqhVok2MA UCaRN6-qfVELTrTZYY05KZiQ UNE TONNE DE JOUETS • Récolte de la Course aux... https://i.ytimg.com/vi/bYIqhVok2MA/default.jpg 211223 1603 267 548 734013
29 rnGTAXJxxsE UCcd0a81G0Uv279pQYPBEGFw Funny Fishing Accident Compilation https://i.ytimg.com/vi/rnGTAXJxxsE/default.jpg 10091 12 3 2 7
... ... ... ... ... ... ... ... ... ...
1347 e9vFO3cS2NM UC3YVzjYBLqIdfgxt8Dki0Bg Accident de voiture mortel en direct - Caméra ... https://i.ytimg.com/vi/e9vFO3cS2NM/default.jpg 45491 31 18 7 2355
1348 fYw8JH2WMAU UClY52uuyYdsF3embOxNoSbg Pilote G 741 C: un camping-car intégral super ... https://i.ytimg.com/vi/fYw8JH2WMAU/default.jpg 4254 5 0 0 221
1349 AFIpflYqgzU UCR5VdjpClw2-amJobJaDq0w Racing and Rally Crash Compilation Week 34 Aug... https://i.ytimg.com/vi/AFIpflYqgzU/default.jpg 100676 575 24 73 25761
1350 kfCRNBoV_DU UCMjG44f_717KnHQRwmAu1iA Audi IDIOT Drivers, EPIC AUDI Driving Fails, C... https://i.ytimg.com/vi/kfCRNBoV_DU/default.jpg 195128 218 75 75 2362
1351 4z8H7d54FSY UC1WzEVwp7a4PNJHAnnkO6Kw 5 GRAVES ACCIDENTS AUX JEUX OLYMPIQUES ! https://i.ytimg.com/vi/4z8H7d54FSY/default.jpg 812324 2350 716 267 29953
1352 sWzjDhq8q0c UCWhcacWv3wXH_Fkp_qolP5A Top 23 Airplane Crash fatal Compilation 2016 ◆... https://i.ytimg.com/vi/sWzjDhq8q0c/default.jpg 832082 886 324 257 33251
1353 VmgS4mUxCE4 UCpg3_r57Mh2GlAxY9lSl6Ig Stage Forumle 3 et Formule 1 - LRS Formula https://i.ytimg.com/vi/VmgS4mUxCE4/default.jpg 35777 24 4 2 47531
1354 yPN6_e2xVBI UC5Cj_jenWEKwe_ukcQAmVqQ 8 Craziest Group B Rally Cars https://i.ytimg.com/vi/yPN6_e2xVBI/default.jpg 15029 246 14 71 31274
1355 TLfYc7HPuzY UClz5XQbGTv2VMIJGPrtM5SQ ce mec est un sacre pilote de moto https://i.ytimg.com/vi/TLfYc7HPuzY/default.jpg 75607 255 62 66 49
1356 3BnLDSJFcwA UCbv4qEAknXjTlbMT1voytWg Car Crash) very Shock dash camera 2017 NEW By ... https://i.ytimg.com/vi/3BnLDSJFcwA/default.jpg 72305 63 65 12 0
1357 10vobhwuxaI UCCo921pfh8pX1LnzMGI33Ag Formule 1 2017 teams getting ready https://i.ytimg.com/vi/10vobhwuxaI/default.jpg 123224 285 53 61 1823
1358 6B8ItvtgBSg UCigJa_VqAvkQSsQtKNvNhOw Zombie Snowflakes crash constitutional carry r... https://i.ytimg.com/vi/6B8ItvtgBSg/default.jpg 2499 110 3 74 7290
1359 dDCC-kshcgk UCvhgQhppAXQ3gvQS5vfnh0g Barry Sheene : La légende du N° 7 Pilote moto GP https://i.ytimg.com/vi/dDCC-kshcgk/default.jpg 11042 38 1 2 742
1360 ywYm3teQ7NY UCTKTXsGuEF5-qmqLV4eSe0g C'est Canteloup - La voiture électrique d'Arna... https://i.ytimg.com/vi/ywYm3teQ7NY/default.jpg 12820 85 0 6 4943
1361 QeQLSTQLZ30 UCIMGfEAERXjmWwQeg15BFsg Nicolas Canteloup - François Hollande, pilote ... https://i.ytimg.com/vi/QeQLSTQLZ30/default.jpg 10575 45 2 7 70234
1362 PAjH0S8AroY UCttspZesZIDEwwpVIgoZtWQ Patna Boat Accident: Survivors of the accident... https://i.ytimg.com/vi/PAjH0S8AroY/default.jpg 879 3 1 2 1309992
1363 TBxWkIZeG1w UC6lY5Ls2kdhGJog2Zw7nUiw -Kaasua! 6- Finnish Rally Action 2016 https://i.ytimg.com/vi/TBxWkIZeG1w/default.jpg 81467 405 13 29 23040
1364 vfe-eNq-Qyg UCX6b17PVsYBQ0ip5gyeme-Q The Silk Road and Ancient Trade: Crash Course ... https://i.ytimg.com/vi/vfe-eNq-Qyg/default.jpg 2293078 26632 327 4224 5338807
1365 kH110yjYZ2g UCiqd3GLTluk2s_IBt7p_LjA #122: Electronic Circuit Construction Techniqu... https://i.ytimg.com/vi/kH110yjYZ2g/default.jpg 60341 1018 9 102 59879
1366 xQKD-5NTqB4 UCG8eeyqW9yG5C4ckZDT6cdw pilote de Mirage 2000 avion de chasse https://i.ytimg.com/vi/xQKD-5NTqB4/default.jpg 19703 68 3 24 792
1367 fipujlYCQm0 UCkrKW1_u0O8qSc3xsLoRHew TRAMPOLINE PARK SLIDE OBSTACLE COURSE (HARD MODE) https://i.ytimg.com/vi/fipujlYCQm0/default.jpg 3436024 75531 1079 5513 1944926
1368 KlpVZnwZ2Eg UCy23eRKH3BoOaf8wXGGYO0A accident formule 1 circuit nogaro https://i.ytimg.com/vi/KlpVZnwZ2Eg/default.jpg 1584 4 0 0 5
1369 p9Umk5aG8Y4 UCW8-pJyPdYAdubad1veFB3A Making a Circuit Board From Scratch https://i.ytimg.com/vi/p9Umk5aG8Y4/default.jpg 182394 3339 31 344 5455
1370 SjnLTFC1flk UCE7m81V_D45SCiUJoWJ602w I-41 crash 1/10/17 Public https://i.ytimg.com/vi/SjnLTFC1flk/default.jpg 107700 189 27 68 28
1371 mMxLzaDuWOE UCIUEPjXOuCvZwNhM-V1-9PQ Calculating Current in a Parallel Circuit.mov https://i.ytimg.com/vi/mMxLzaDuWOE/default.jpg 361133 1402 97 217 794
1372 BiP0QNj4dlg UCZO3HO3Tki8mQ3eBYD8ISQw Quand le pilote paie sa place dans l’avion - E... https://i.ytimg.com/vi/BiP0QNj4dlg/default.jpg 16796 42 6 8 2036
1373 c2zGYu-l7t0 UC5rBpVgv83gYPZ593XwQUsA Ice Driving in 911 Rally Cars - /CHRIS HARRIS ... https://i.ytimg.com/vi/c2zGYu-l7t0/default.jpg 441214 3783 71 486 1735758
1374 hbQK1sgdb7M UCVXTRivkuNxfu-MLQbTt0gA USA Crazy Driver & Car Crash Compilation Episo... https://i.ytimg.com/vi/hbQK1sgdb7M/default.jpg 33872 179 69 83 10205
1375 Lt9M_tSij_4 UC1mMYyv7J2JjmdRE-a-jd5A Truck DASH CAM Crash ★ DASH CAM Truck Road Acc... https://i.ytimg.com/vi/Lt9M_tSij_4/default.jpg 16782 73 11 10 0
1376 Rr1YAqnd9P0 UCcOvNM7TC71yEDUUd7zMxcQ [HD] ✈ VLOG - Présentation de mon parcours de ... https://i.ytimg.com/vi/Rr1YAqnd9P0/default.jpg 2484 118 2 45 857

1377 rows × 9 columns

Get the images database

From the thumbnail URLs, we download the images and then save them in a database for the CNN algorithm. This step uses PIL, which is not installed in our Docker images, but the cell can be run in Jupyter.


In [ ]:
from PIL import Image 

#the thumbnails are downloaded into this folder, create it if needed
os.makedirs('Image3', exist_ok=True)

#one record per video: the thumbnail pixels as a flat float32 array
records = []
for video_id, thumbnail_url in zip(videos_database['id'], videos_database['thumbnailUrl']):
    image_path = 'Image3/' + video_id + '.png'

    #download the thumbnail; the file is closed even if the request fails
    with open(image_path, 'wb') as f:
        f.write(requests.get(thumbnail_url).content)

    #load the image back and flatten its pixels
    with Image.open(image_path) as img:
        p = np.array(img, 'float32')
    records.append({'imag': p.flatten()})

#build the dataframe once instead of appending row by row
imag = pd.DataFrame(records, columns=['imag'])

#NOTE(review): every cell of 'imag' holds a numpy array - confirm that the
#sqlite driver is able to store such values.
filename = os.path.join(folder, 'imag.sqlite')
imag.to_sql('imag', 'sqlite:///' + filename, if_exists='replace')
print('done')