Github: https://github.com/min2bro/WebScrapingwithSelenium
Twitter: @min2bro
In [31]:
%matplotlib inline
from selenium import webdriver
import os,time,json
import pandas as pd
from collections import defaultdict,Counter
import matplotlib.pyplot as plt
In [32]:
# IMDb list page that the rest of the notebook scrapes.
url = "http://www.imdb.com/list/ls061683439/"

# Scraper settings live under the top-level 'data' key of this JSON file:
# the value handed to webdriver.Chrome plus the XPath expressions that are
# looked up by key further down.
with open('./img/filmfare.json', encoding="utf-8") as f:
    datatbl = json.load(f)

# Start Chrome with the configured 'chromedriver' value (presumably the path
# to the chromedriver executable -- confirm against the JSON file) and load
# the list page.
driver = webdriver.Chrome(datatbl['data']['chromedriver'])
driver.get(url)
In [33]:
def ExtractText(Xpath):
    """Return the text of every page element matched by a configured XPath.

    Args:
        Xpath: Key into ``datatbl['data']`` whose value is the XPath
            expression evaluated against the page loaded in ``driver``.

    Returns:
        A list of strings, one per matched element, in the order the driver
        returns them. For ``"Movies_Runtime_Xpath"`` only the slice
        ``text[-10:-7]`` of each element's text is kept; every other key
        yields the full element text.
    """
    # Use the generic ``find_elements(by, value)`` call: the
    # ``find_elements_by_xpath`` shortcut no longer exists in current
    # Selenium releases, while this form works on old and new versions.
    elements = driver.find_elements("xpath", datatbl['data'][Xpath])
    if Xpath == "Movies_Runtime_Xpath":
        # Keep only the three characters at [-10:-7] of the element text
        # (presumably the minutes figure -- depends on the page layout).
        return [element.text[-10:-7] for element in elements]
    return [element.text for element in elements]
In [34]:
# Extracting Data from Web
# Placeholders that are not used by the cells visible here; kept in case
# other cells reference them.
Movies_Votes, Movies_Name, Movies_Ratings, Movies_RunTime = [[] for i in range(4)]

# Config keys of the XPath expressions to scrape. The position in this list
# is the column index used in datarepo (0 = names, 1 = ratings,
# 2 = runtimes, 3 = votes).
Xpath_list = ['Movies_Name_Xpath', 'Movies_Rate_Xpath', 'Movies_Runtime_Xpath', 'Movies_Votes_Xpath']

datarepo = []
try:
    for i, xpath_key in enumerate(Xpath_list):
        if i == 3:
            # Click the configured 'listview' element before reading the
            # last column (presumably the votes are only present in that
            # view -- confirm against the page).
            driver.find_element("xpath", datatbl['data']['listview']).click()
        datarepo.append(ExtractText(xpath_key))
finally:
    # Always shut the browser down, even when scraping fails part-way, so
    # no Chrome/chromedriver process is left running.
    driver.quit()
In [35]:
# Preview the first five entries of column 0 (movie names) and of
# column 3 (votes, per the order of Xpath_list) of the scraped data.
for column in (0, 3):
    print(datarepo[column][:5])
In [36]:
# Collect the scraped columns into one record per movie, keyed by the
# movie's position in the list.
# Years counts down from 2015 to 1955, one value per list position.
Years = range(2015, 1954, -1)
result = defaultdict(dict)
for position, title in enumerate(datarepo[0]):
    record = result[position]
    record['Movie Name'] = title
    record['Year'] = Years[position]
    record['Rating'] = datarepo[1][position]
    record['Votes'] = datarepo[3][position]
    record['RunTime'] = datarepo[2][position]
In [37]:
# Display the full mapping of list position -> movie record.
result
Out[37]:
In [38]:
# Pretty-print the record at position 58 as a spot check.
# result is a defaultdict, so plain indexing would silently insert an empty
# record when the key is missing; .get() reads without modifying result.
print(json.dumps(result.get(58, {}), indent=2))
In [39]:
# Convert the scraped strings of every record to numeric types in place.
for record in result.values():
    # Votes carry thousands separators, e.g. "12,345".
    record['Votes'] = int(record['Votes'].replace(",", ""))
    record['Rating'] = float(record['Rating'])
    try:
        record['RunTime'] = int(record['RunTime'])
    except ValueError:
        # Runtime text that is not a plain integer falls back to the
        # hard-coded value 154.
        record['RunTime'] = 154
In [40]:
# Display the converted record at position 58.
# .get() avoids inserting an empty record into the defaultdict when the key
# is missing, which would otherwise add an empty row to the DataFrame built
# from result.
result.get(58, {})
Out[40]:
In [55]:
# Build a DataFrame with one row per movie (the dict keys become the index),
# then arrange the columns in presentation order and display the table.
column_order = ['Year', 'Movie Name', 'Rating', 'Votes', 'RunTime']
df = pd.DataFrame.from_dict(result, orient='index')
df = df.loc[:, column_order]
df
Out[55]:
In [74]:
# Print a summary of the movie table (columns, dtypes, non-null counts).
df.info()
In [48]:
# Five highest-rated movies: order by Rating, largest first, keep the top 5.
df.sort_values(by='Rating', ascending=False).head(5)
Out[48]:
In [49]:
# Ten longest movies: order by RunTime, largest first, keep the top 10.
df.sort_values(by='RunTime', ascending=False).head(10)
Out[49]:
In [75]:
# Line plot of RunTime against Year.
# `x` must be a column label; passing the Series itself (df.Year) is
# rejected by current pandas versions.
df.plot(x='Year', y=['RunTime']);
In [48]:
# Mean of the RunTime column over all movies in df.
df['RunTime'].mean()
Out[48]:
In [76]:
# Number of movies whose Rating is at least 7.
df.loc[df['Rating'] >= 7, 'Rating'].count()
Out[76]:
In [77]:
# Count the movies in each rating band. The insertion order of the dict
# determines the left-to-right order of the bars.
Rating_Histdic = {
    'Btwn 6&7': df[(df['Rating'] >= 6) & (df['Rating'] < 7)]['Rating'].count(),
    'GTEQ 8': df[(df['Rating'] >= 8)]['Rating'].count(),
    'Btwn 7 & 8': df[(df['Rating'] >= 7) & (df['Rating'] < 8)]['Rating'].count(),
}

# matplotlib needs real sequences: dict views (values()/keys()) are not
# converted to arrays correctly, so materialize them as lists first.
plt.bar(range(len(Rating_Histdic)), list(Rating_Histdic.values()), align='center', color='brown', width=0.4)
plt.xticks(range(len(Rating_Histdic)), list(Rating_Histdic.keys()), rotation=25);
In [78]:
import numpy as np

# Pie chart of the rating bands: slice sizes are read from Rating_Histdic
# in the same order as the labels, each slice annotated with its share.
labels = ['Btwn 6&7', 'GTEQ 8', 'Btwn 7 & 8']
colors = ['red', 'orange', 'green']
Rating_Hist = [Rating_Histdic[band] for band in labels]
plt.pie(Rating_Hist, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90);
In [54]:
# Tally the entries stored under the config's 'Genre' key (assumed to be a
# list of genre labels -- confirm against the JSON file), keep the five most
# frequent and draw them as a horizontal bar chart.
Category = Counter(datatbl['data']['Genre'])
df1 = pd.DataFrame.from_dict(Category, orient='index')
df1 = df1.sort_values(by=0, ascending=False).head(5)
df1.plot.barh(color=['g', 'c', 'm']);