In this notebook we build our video database from a list of search terms.
The search terms go in the "searches.txt" file, one search per line; we can have as many searches as we want.
The number of videos retrieved for each search line is set by video_per_search in the "Get a list of videos IDs" section (a multiple of 50, up to 500; 150 in this run).
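For example, a "searches.txt" with three searches could look like this (the terms below are placeholders, not the ones used to build the original database):
funny cats
gaming
cooking tutorial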
In [1]:
#maximum number of IDs the API accepts in a single request
VIDEOS_REQUEST_ID_LIMIT = 50
CHANNEL_REQUEST_ID_LIMIT = 50
#YouTube Data API v3 keys (redacted)
key1 = "KEY"
key2 = "KEY"
DEVELOPER_KEY = key2
import requests
import json
import pandas as pd
from math import ceil
import numpy as np
import tensorflow as tf
import time
import collections
import os
import timeit
from IPython.display import display
In [ ]:
#where the databases will be stored
folder = 'sql_database'
os.makedirs(folder, exist_ok=True)
#how many videos per search we want to get (max=500, multiple of 50)
video_per_search = 150
In [3]:
#----------------------------------------------------------------------------------------
# VIDEO REQUEST
# get the information about the videos
# id_list : list of the IDs of the videos we want to get the information from
# db : the video dataframe where we will add the video information
#----------------------------------------------------------------------------------------
def videos_request(id_list, db):
    nbr_videos = len(id_list)
    #the API limits how many IDs we can send per "get"
    nbr_requests = int(np.ceil(nbr_videos/VIDEOS_REQUEST_ID_LIMIT))
    print('videos_request(): number of videos:', nbr_videos)
    print('videos_request(): number of requests:', nbr_requests)
    for i in range(nbr_requests):
        print('videos_request(): request:', i+1, '/', nbr_requests)
        limit = min(VIDEOS_REQUEST_ID_LIMIT, nbr_videos)
        nbr_videos -= VIDEOS_REQUEST_ID_LIMIT
        #concatenate the video IDs into one comma-separated parameter
        req_id = ','.join(id_list[i*VIDEOS_REQUEST_ID_LIMIT : i*VIDEOS_REQUEST_ID_LIMIT + limit])
        #we get "snippet" and "statistics"
        url = 'https://www.googleapis.com/youtube/v3/videos?part=snippet,statistics&id={}&key={}'.format(req_id, DEVELOPER_KEY)
        r = requests.get(url)
        #print(r.text) #if we want to print the json response
        data = json.loads(r.text)
        #if we don't have any error we should have items
        if 'items' in data:
            for item in data['items']:
                serie = dict()
                serie['id'] = item['id']
                serie['channelId'] = item['snippet']['channelId']
                serie['title'] = item['snippet']['title']
                serie['thumbnailUrl'] = item['snippet']['thumbnails']['default']['url']
                serie['viewCount'] = item['statistics']['viewCount']
                #sometimes these fields are missing from the response, so we check for them
                if 'likeCount' in item['statistics']:
                    serie['likeCount'] = item['statistics']['likeCount']
                else:
                    serie['likeCount'] = '0'
                if 'dislikeCount' in item['statistics']:
                    serie['dislikeCount'] = item['statistics']['dislikeCount']
                else:
                    serie['dislikeCount'] = '0'
                if 'commentCount' in item['statistics']:
                    serie['commentCount'] = item['statistics']['commentCount']
                else:
                    serie['commentCount'] = '0'
                serie['subsCount'] = 'na'
                db = pd.concat([db, pd.DataFrame([serie])], ignore_index=True)
        else:
            print('videos_request(): error: no items in data')
    #return the new dataframe
    return db
#----------------------------------------------------------------------------------------
# VIDEO ID LIST FROM CHANNEL ID
# get a list of videos from the channel IDs
# id_channel : list of channel IDs
#----------------------------------------------------------------------------------------
def videos_id_list_from_channel_id(id_channel):
    #prepare the videos list to return
    vid_list = []
    print('videos_id_list_from_channel_id(): number of channels:', len(id_channel))
    for n in range(len(id_channel)):
        channel_id = id_channel[n]
        r = requests.get('https://www.googleapis.com/youtube/v3/search?part=id&publishedBefore=2016-07-07T00:00:00Z&maxResults=20&type=video&channelId={}&key={}'.format(channel_id, DEVELOPER_KEY))
        data = json.loads(r.text)
        #print(r.text)
        print('videos_id_list_from_channel_id(): channel:', n+1, '/', len(id_channel))
        totalVideo = data['pageInfo']['totalResults']
        nbIter = ceil(totalVideo/20)
        print('videos_id_list_from_channel_id(): totalvideo=', totalVideo, 'nbiter=', nbIter)
        print('videos_id_list_from_channel_id(): page: 1 /', nbIter)
        for items in data['items']:
            vid_list += [items['id']['videoId']]
        if 'nextPageToken' in data:
            nextPage = data['nextPageToken']
            for i in range(nbIter-1):
                print('videos_id_list_from_channel_id(): page:', i+2, '/', nbIter)
                r = requests.get('https://www.googleapis.com/youtube/v3/search?part=id&maxResults=20&type=video&channelId={}&pageToken={}&key={}'.format(channel_id, nextPage, DEVELOPER_KEY))
                data = json.loads(r.text)
                if 'nextPageToken' in data:
                    nextPage = data['nextPageToken']
                if len(data['items']) < 1:
                    print('ERROR: no items')
                for items in data['items']:
                    vid_list += [items['id']['videoId']]
    return vid_list
#----------------------------------------------------------------------------------------
# CHANNEL REQUEST
# get the information about the channels (only the number of subscribers)
# id_list : list of the IDs of the channels we want to get the information from
# db : the channel dataframe where we will add the channel information
#----------------------------------------------------------------------------------------
def channel_request(id_list, db):
    nbr_channels = len(id_list)
    nbr_requests = int(np.ceil(nbr_channels/CHANNEL_REQUEST_ID_LIMIT))
    print('channel_request(): number of channels:', nbr_channels)
    print('channel_request(): number of requests:', nbr_requests)
    for i in range(nbr_requests):
        print('channel_request(): request:', i+1, '/', nbr_requests)
        limit = min(CHANNEL_REQUEST_ID_LIMIT, nbr_channels)
        nbr_channels -= CHANNEL_REQUEST_ID_LIMIT
        #concatenate the channel IDs into one comma-separated parameter
        req_id = ','.join(id_list[i*CHANNEL_REQUEST_ID_LIMIT : i*CHANNEL_REQUEST_ID_LIMIT + limit])
        url = 'https://www.googleapis.com/youtube/v3/channels?part=statistics&id={}&key={}'.format(req_id, DEVELOPER_KEY)
        r = requests.get(url)
        #print(r.text) #if we want to print the json response
        data = json.loads(r.text)
        for item in data['items']:
            serie = dict()
            serie['id'] = item['id']
            serie['subscriberCount'] = item['statistics']['subscriberCount']
            db = pd.concat([db, pd.DataFrame([serie])], ignore_index=True)
    return db
#----------------------------------------------------------------------------------------
# SEARCH VIDEOS
# return a list of video IDs based on a list of search terms
# q : list of search terms
# limit : number of videos to retrieve for each search term
#----------------------------------------------------------------------------------------
def search_videos(q, limit=500):
    vid_list = []
    print('search_videos(): number of searches:', len(q))
    for n in range(len(q)):
        print('search_videos(): request:', n+1, '/', len(q))
        search_term = q[n]
        r = requests.get('https://www.googleapis.com/youtube/v3/search?part=id&order=relevance&relevanceLanguage=FR&safeSearch=none&maxResults=50&type=video&q={}&key={}'.format(search_term, DEVELOPER_KEY))
        data = json.loads(r.text)
        nbIter = ceil(limit/50)
        print('search_videos(): page: 1 /', nbIter)
        if 'items' in data:
            for items in data['items']:
                vid_list += [items['id']['videoId']]
        else:
            print(r.text)
        if 'nextPageToken' in data:
            nextPage = data['nextPageToken']
            for i in range(nbIter-1):
                print('search_videos(): page:', i+2, '/', nbIter)
                r = requests.get('https://www.googleapis.com/youtube/v3/search?part=id&order=relevance&relevanceLanguage=FR&safeSearch=none&maxResults=50&type=video&q={}&pageToken={}&key={}'.format(search_term, nextPage, DEVELOPER_KEY))
                data = json.loads(r.text)
                #print(r.text)
                if 'nextPageToken' in data:
                    nextPage = data['nextPageToken']
                if len(data['items']) < 1:
                    print('ERROR: no items')
                for items in data['items']:
                    vid_list += [items['id']['videoId']]
    return vid_list
Create a new empty dataset for: VIDEOS ID LIST
In [4]:
videos_list_database = pd.DataFrame(columns=['id'])
filename = os.path.join(folder, 'videos_list.sqlite')
videos_list_database.to_sql('videos_list', 'sqlite:///' + filename, if_exists='replace')
Create a new empty dataset for: VIDEOS
In [5]:
videos_database = pd.DataFrame(columns=['id', 'channelId', 'title', 'thumbnailUrl', 'viewCount', 'likeCount','dislikeCount','commentCount','subsCount'])
filename = os.path.join(folder, 'videos.sqlite')
videos_database.to_sql('videos', 'sqlite:///' + filename, if_exists='replace')
In [6]:
start_time = timeit.default_timer()
#get the existing database
videos_list_database = pd.read_sql('videos_list', 'sqlite:///' + os.path.join(folder, 'videos_list.sqlite'), index_col='index')
#convert to a list of str
vid_list = videos_list_database['id'].tolist()
#get the searches requests from the file
with open('searches.txt') as f:
    searches_list = [line.rstrip('\n') for line in f]
print('Number of search terms: ', len(searches_list))
print('Number of video per search: ', video_per_search)
print('Total number of videos expected: ', len(searches_list)*video_per_search)
#get a list of videos IDs based on the search terms
searched_videos_list = search_videos(searches_list, video_per_search)
#add the new list of IDs to the existing one
vid_list += searched_videos_list
print('Number of video IDs we received: ', len(searched_videos_list))
print('Number of video IDs in the database: ', len(vid_list))
#delete the duplicates
vid_list=list(set(vid_list))
print('Number of video IDs in the database without duplicates: ', len(vid_list))
#store the videos IDs into a dataframe
videos_list_database = pd.DataFrame(vid_list,columns=['id'])
#store the dataframe in a sqlite database
filename = os.path.join(folder, 'videos_list.sqlite')
videos_list_database.to_sql('videos_list', 'sqlite:///' + filename, if_exists='replace')
stop_time = timeit.default_timer()
print('Time = ', np.ceil(stop_time - start_time), 'sec')
Get the information of the videos listed in the videos_list database, except the number of subscribers. We cannot get the number of subscribers directly; we will get it in the next step.
Information retrieved: id, channelId, title, thumbnailUrl, viewCount, likeCount, dislikeCount, commentCount.
In [7]:
start_time = timeit.default_timer()
#get the "videos_id_list" and "videos" database
videos_list_database = pd.read_sql('videos_list', 'sqlite:///' + os.path.join(folder, 'videos_list.sqlite'), index_col='index')
videos_database = pd.read_sql('videos', 'sqlite:///' + os.path.join(folder, 'videos.sqlite'), index_col='index')
#create a list of videos IDs from the database
video_list = videos_list_database['id'].tolist()
print('Number of video IDs: ', len(video_list))
#delete the duplicates
video_list = list(set(video_list))
print('Number of video IDs without duplicates: ', len(video_list))
#get the information about the videos
videos_database = videos_request(video_list, videos_database)
print('Number of videos in the database: ', len(videos_database))
#We delete the duplicates from the dataframe and reset the index
videos_database = videos_database.drop_duplicates('title')
videos_database = videos_database.reset_index(drop=True)
print('Number of videos in the database without duplicates: ', len(videos_database))
#store the information into the database
filename = os.path.join(folder, 'videos.sqlite')
videos_database.to_sql('videos', 'sqlite:///' + filename, if_exists='replace')
stop_time = timeit.default_timer()
print('Time = ', np.ceil(stop_time - start_time), 'sec')
In [8]:
start_time = timeit.default_timer()
#get the videos database
videos_database = pd.read_sql('videos', 'sqlite:///' + os.path.join(folder, 'videos.sqlite'), index_col='index')
#create a dataframe to store the number of subs
channels_database = pd.DataFrame(columns=['id','subscriberCount'])
#create a list of channels IDs
channels_list = videos_database['channelId'].tolist()
#delete the duplicates
channels_list=list(set(channels_list))
print('Number of channels:',len(channels_list))
#get the number of subscribers for each channel
channels_database = channel_request(channels_list, channels_database)
#just to be sure we delete the duplicates
channels_database = channels_database.drop_duplicates('id')
channels_database = channels_database.reset_index(drop=True)
channelsNotFound = 0
#for each video, get the subs count
for i in range(len(videos_database)):
    channelId = videos_database['channelId'][i]
    ans = channels_database.query("id == @channelId")
    if not i % 100:
        print('Sub counter update: ', i, '/', len(videos_database))
    #if we found the channel
    if len(ans) == 1:
        videos_database.loc[i, 'subsCount'] = int(ans['subscriberCount'].iloc[0])
    else:
        videos_database.loc[i, 'subsCount'] = 0
        channelsNotFound += 1
print('Videos without correct subs count:', channelsNotFound)
#store the updated database
filename = os.path.join(folder, 'videos.sqlite')
videos_database.to_sql('videos', 'sqlite:///' + filename, if_exists='replace')
stop_time = timeit.default_timer()
print('Time = ', np.ceil(stop_time - start_time), 'sec')
#display(channels_database)
In [9]:
#display the videos database
videos_database = pd.read_sql('videos', 'sqlite:///' + os.path.join(folder, 'videos.sqlite'), index_col='index')
display(videos_database)
In [ ]:
from PIL import Image
#make sure the image folder exists
os.makedirs('Image3', exist_ok=True)
#download each thumbnail and save it as a png
for i in range(len(videos_database['thumbnailUrl'])):
    with open('Image3/'+videos_database['id'][i]+'.png', 'wb') as f:
        f.write(requests.get(videos_database['thumbnailUrl'][i]).content)
#load the images back and store the flattened pixel data in a dataframe
imag = pd.DataFrame(columns=['imag'])
for i in range(len(videos_database['thumbnailUrl'])):
    serie = dict()
    img = Image.open('Image3/'+videos_database['id'][i]+'.png')
    p = np.array(img, 'float32')
    #store the flattened pixels as raw bytes so sqlite can hold them in a single column
    serie['imag'] = p.flatten().tobytes()
    imag = pd.concat([imag, pd.DataFrame([serie])], ignore_index=True)
filename = os.path.join(folder, 'imag.sqlite')
imag.to_sql('imag', 'sqlite:///' + filename, if_exists='replace')
print('done')
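To read the thumbnails back from "imag.sqlite", the raw bytes stored above have to be re-interpreted as float32 values and reshaped. A minimal sketch, assuming default-size thumbnails (120x90 pixels, RGB, hence a (90, 120, 3) array; a grayscale or RGBA thumbnail would need a different channel count):
In [ ]:
#read the flattened thumbnails back and restore the image shape
imag = pd.read_sql('imag', 'sqlite:///' + os.path.join(folder, 'imag.sqlite'), index_col='index')
first = np.frombuffer(imag['imag'][0], dtype='float32').reshape(90, 120, 3)
print(first.shape) #(90, 120, 3)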