Github: https://github.com/min2bro/WebScraping/blob/master/WebScrapingnDataAnalysis.ipynb
Twitter: @min2bro
If you want to follow along, you can view or run this notebook yourself:
All materials (notebook, data, link to nbviewer): https://github.com/min2bro/WebScraping/blob/master/WebScrapingnDataAnalysis.ipynb
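You need pandas 0.17 or newer (the notebook calls `DataFrame.sort_values`), plus Selenium and a ChromeDriver binary for the scraping cells; the easiest way to get pandas is to install Anaconda.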
In [219]:
%matplotlib inline
from selenium import webdriver
import time,re,json,numpy as np
import pandas as pd
from collections import defaultdict,Counter
import matplotlib.pyplot as plt
In [220]:
# IMDb list page that the browser session will open.
url = "http://www.imdb.com/list/ls061683439/"

# Scraper settings live in a JSON file. Later cells read XPath strings, the
# chromedriver location and a 'Genre' list from under its 'data' key.
with open('./img/filmfare.json', encoding="utf-8") as config_file:
    datatbl = json.load(config_file)

# Start Chrome through the configured chromedriver and load the page.
# NOTE(review): passing the driver path positionally is the older Selenium
# calling convention - confirm it matches the installed Selenium version.
driver = webdriver.Chrome(datatbl['data']['chromedriver'])
driver.get(url)
In [221]:
def ExtractText(Xpath):
    """Return the text of the page elements selected by a configured XPath.

    Uses the notebook-level ``driver`` (Selenium session) and ``datatbl``
    (loaded JSON configuration).

    Parameters
    ----------
    Xpath : str
        Key under ``datatbl['data']`` whose value is the XPath expression.

    Returns
    -------
    list of str
        The ``.text`` of each element, in lookup order.

    Raises
    ------
    KeyError
        If ``Xpath`` is not a key of ``datatbl['data']``.
    """
    locator = datatbl['data'][Xpath]
    # "xpath" is the value of selenium's By.XPATH. The generic
    # find_element/find_elements calls are used because the
    # find_element(s)_by_xpath helpers no longer exist in current Selenium.
    if Xpath == 'Movies_Director_Xpath':
        # Directors are fetched one element at a time at positions 1, 3, ..., 121.
        # NOTE(review): presumably every second match is a non-director line
        # and the list has 61 entries - confirm against the page markup.
        return [
            driver.find_element("xpath", '%s[%d]' % (locator, position)).text
            for position in range(1, 123, 2)
        ]
    return [element.text for element in driver.find_elements("xpath", locator)]
In [222]:
#Extracting Data from Web
# NOTE(review): these four lists are not read by any cell visible here; they
# are kept only so that the notebook namespace stays unchanged.
Movies_Votes,Movies_Name,Movies_Ratings,Movies_RunTime=[[] for i in range(4)]
Xpath_list = ['Movies_Name_Xpath','Movies_Rate_Xpath','Movies_Runtime_Xpath','Movies_Votes_Xpath',
              'Movies_Director_Xpath']
# One independent list per XPath key ([[]]*5 would repeat a single shared list).
datarepo = [[] for _ in Xpath_list]
try:
    for i, xpath_key in enumerate(Xpath_list):
        # The page view is switched before votes and again before directors
        # are read. "xpath" is the value of selenium's By.XPATH.
        if xpath_key == 'Movies_Votes_Xpath':
            driver.find_element("xpath", datatbl['data']['listview']).click()
        if xpath_key == 'Movies_Director_Xpath':
            driver.find_element("xpath", datatbl['data']['detailview']).click()
        datarepo[i] = ExtractText(xpath_key)
finally:
    # Always close the browser, even when a lookup above raises.
    driver.quit()
In [223]:
# Preview: first five movie names, a blank line, then the first five entries
# of datarepo[3] (filled from 'Movies_Votes_Xpath', i.e. votes rather than ratings).
print(datarepo[0][:5], "", datarepo[3][:5], sep="\n")
In [224]:
# Assemble one record per movie, keyed by its position in the scraped lists.
# Years are assigned purely by position, counting down from 2015 to 1955.
# NOTE(review): this assumes the page lists movies newest-first - confirm.
Years = range(2015, 1954, -1)
result = defaultdict(dict)
for i, movie_name in enumerate(datarepo[0]):
    result[i] = {
        'Movie Name': movie_name,
        'Year': Years[i],
        'Rating': datarepo[1][i],
        'Votes': datarepo[3][i],
        'RunTime': datarepo[2][i],
        'Genre': datatbl['data']['Genre'][i],
        'Director': datarepo[4][i],
    }
In [225]:
# Pretty-print the first scraped record.
# The module is imported under its own name: with "import pprint as pp" the
# name "pprint" is never bound, so pprint.PrettyPrinter raised NameError.
import pprint
pp = pprint.PrettyPrinter(depth=10)
pp.pprint(result[0])
In [226]:
# Show the first record as indented JSON.
print(json.dumps(result[0], indent=4))
In [227]:
# Convert the scraped strings of every record to proper types, in place.
for values in result.values():
    # str() keeps this cell re-runnable: on a second run Votes is already an int.
    values['Votes'] = int(str(values['Votes']).replace(",", ""))
    values['Rating'] = float(values['Rating'])
    values['Director'] = values['Director'].replace('Director: ', '')

    runtime = values['RunTime']
    if isinstance(runtime, (int, float)):
        # Already numeric (converted by an earlier run of this cell); leave as is.
        continue
    try:
        # The last run of digits in the text is taken as the runtime.
        values['RunTime'] = int(re.findall(r'\d+', runtime)[-1])
    except (TypeError, IndexError):
        # Non-text value or no digits found: mark as missing.
        # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
        values['RunTime'] = np.nan
In [228]:
# Show the first record again as indented JSON.
print(json.dumps(result[0], indent=4))
In [229]:
# create dataframe: one row per record in result, columns in display order
df = pd.DataFrame.from_dict(result, orient='index')
df = df[['Year', 'Movie Name', 'Rating', 'Votes', 'Genre', 'RunTime', 'Director']]
# 1-based row labels sized from the data, so the cell also works when the
# number of scraped movies is not exactly 61.
df.index = np.arange(1, len(df) + 1)
df.head(10)
Out[229]:
In [230]:
# Count the rows of df that contain at least one missing value.
rows_with_missing = df.isnull().any(axis=1)
nans = int(rows_with_missing.sum())
print('%d rows have missing values' % nans)
In [231]:
# Replace missing runtimes with the (truncated) mean runtime.
# The fill is restricted to the RunTime column: a scalar fillna would also
# write the runtime mean into missing values of every other column.
mean_runtime = int(df['RunTime'].mean())
df = df.fillna({'RunTime': mean_runtime})
df.head()
Out[231]:
In [15]:
# Print a summary of df (columns, dtypes, non-null counts).
df.info()
In [232]:
#Highest Rating Movies
df1 = df.sort_values('Rating', ascending=False).head(5)
# Number the rows from 1; sized from the result so fewer than five rows also work.
df1.index = np.arange(1, len(df1) + 1)
df1
Out[232]:
In [233]:
# Rating per year. x is given as a column label: DataFrame.plot documents x as
# "label or position", and passing the Series df.Year fails on current pandas.
df.plot(x='Year', y=['Rating']);
In [234]:
# Lowest Rating Movies
df1 = df.sort_values('Rating', ascending=True).head(5)
# Number the rows from 1; sized from the result so fewer than five rows also work.
df1.index = np.arange(1, len(df1) + 1)
df1
Out[234]:
In [235]:
#Movies with maximum Run Time
df1 = df.sort_values(['RunTime'], ascending=[False]).head(10)
# Number the rows from 1; sized from the result so fewer than ten rows also work.
df1.index = np.arange(1, len(df1) + 1)
df1
Out[235]:
In [236]:
# Runtime per year. x is given as a column label: DataFrame.plot documents x as
# "label or position", and passing the Series df.Year fails on current pandas.
df.plot(x='Year', y=['RunTime']);
In [237]:
# Mean of the RunTime column.
df['RunTime'].mean()
Out[237]:
In [238]:
# Number of movies whose rating is at least 7.
(df['Rating'] >= 7).sum()
Out[238]:
In [240]:
# Count movies per rating band and draw the counts as a bar chart.
# A plain dict replaces defaultdict(dict): the values are integer counts, and a
# mistyped key should raise instead of silently creating an empty dict.
# The keys are looked up again by the pie-chart cell, so they are kept as is.
ratings = df['Rating']
Rating_Histdic = {
    'Btwn 6&7': ratings[(ratings >= 6) & (ratings < 7)].count(),
    'GTEQ 8': ratings[ratings >= 8].count(),
    'Btwn 7 & 8': ratings[(ratings >= 7) & (ratings < 8)].count(),
}
bar_positions = range(len(Rating_Histdic))
# Dict views are turned into lists before being handed to matplotlib.
plt.bar(bar_positions, list(Rating_Histdic.values()), align='center', color='brown', width=0.4)
plt.xticks(bar_positions, list(Rating_Histdic.keys()), rotation=25);
In [241]:
# Pie chart of the rating bands counted in Rating_Histdic.
labels = ['Btwn 6&7', 'GTEQ 8', 'Btwn 7 & 8']
colors = ['red', 'orange', 'green']
# Counts are looked up from the labels so the two lists cannot drift apart.
# (The extra "import numpy as np" was dropped: numpy is already imported in the
# first cell and is not used here.)
Rating_Hist = [Rating_Histdic[label] for label in labels]
plt.pie(Rating_Hist, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90);
In [25]:
# Tally the configured genre list and chart the five most frequent genres.
Category = Counter(datatbl['data']['Genre'])
# orient='index' gives one row per genre, with the count in column 0.
genre_counts = pd.DataFrame.from_dict(Category, orient='index')
df1 = genre_counts.sort_values([0], ascending=[False]).head(5)
df1.plot(kind='barh', color=['g', 'c', 'm']);
In [270]:
# Tag every row with the number of rows that share its director (this adds a
# 'freq' column to df), then keep the rows of directors appearing more than once.
df['freq'] = df.groupby('Director')['Director'].transform('count')
# df2 is the filtered frame without the helper column.
df2 = df[df['freq'] > 1].drop('freq', axis=1)
In [276]:
# Grouping by every column leaves nothing to count, so the result is just the
# sorted group keys (Director first); show the first eight rows.
group_columns = ['Director', 'Year', 'Movie Name', 'Rating', 'Genre', 'Votes', 'RunTime']
df2.groupby(group_columns).count().head(8)
Out[276]:
In [288]:
# Scratch check of the 'Director: ' prefix removal used when cleaning records.
# NOTE(review): pandas is already imported in the first cell; this re-import is redundant.
import pandas as pd
data = 'Director: Satyen Bose'
# str.replace returns a new string; data itself still holds the prefixed text.
data.replace('Director: ','')
Out[288]:
In [201]:
# NOTE(review): leftover scratch cell. Its execution count (201) is lower than
# that of the cell above (288), so the `data` it originally used is not shown.
# With `data` as the single string assigned above, one value is paired with five
# column names and pandas will likely raise - confirm, then fix or delete.
headers = ['Movie Name','Run Time','Rating','Actor','Actor1']
pd.DataFrame([data],columns=headers)
Out[201]:
In [138]:
# Runtime per year (same chart as plotted earlier in the notebook). x is given
# as a column label: DataFrame.plot documents x as "label or position", and
# passing the Series df.Year fails on current pandas.
df.plot(x='Year', y=['RunTime']);
In [ ]: