In [ ]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

In [ ]:
# Empty single-column collectors, one per scraped review field.
df_help, df_review, df_date, df_author, df_star = (
    pd.DataFrame(columns=[field])
    for field in ('help', 'review', 'date', 'author', 'star')
)

1.1) Amazon Fire


In [ ]:
# 1.1) Amazon Fire -- scrape the review pages.
# For each page, read the helpfulness votes, review text, author, date and star
# rating of every review, then assemble them into df_help (one row per review).

# Fresh per-product collectors: rows from a previously scraped product must not
# leak into this product's data, and appending to plain lists avoids growing a
# DataFrame one row at a time.
help_rows, review_rows, author_rows, date_rows, star_rows = [], [], [], [], []

# Adjacent string literals instead of a backslash continuation inside the string,
# which embedded the next line's indentation (spaces) in the URL.
page_url = ("http://www.amazon.com/Fire-Display-Wi-Fi-GB-Includes/product-reviews/B00TSUGXKE/"
            "ref=cm_cr_arp_d_paging_btm_{page}"
            "?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber={page}")

driver = webdriver.Firefox()
try:
    # NOTE(review): this range starts at page 0 while the other products start at 1 -- confirm.
    for page in range(0, 3994):
        driver.get(page_url.format(page=page))
        authors = driver.find_elements(By.CSS_SELECTOR, "a.a-size-base.a-link-normal.author")
        dates = driver.find_elements(By.CSS_SELECTOR, "div.a-fixed-right-grid-col.a-col-left span.a-size-base.a-color-secondary.review-date")
        helpss = driver.find_elements(By.CSS_SELECTOR, "div.a-section.review span.cr-vote-buttons span.a-color-secondary")
        reviews = driver.find_elements(By.CSS_SELECTOR, "span.a-size-base.review-text")
        stars = driver.find_elements(By.CSS_SELECTOR, 'div.a-fixed-right-grid-col.a-col-left a.a-link-normal i.a-icon span.a-icon-alt')
        for helps in helpss:
            helps = helps.get_attribute('innerHTML').replace(",", "")
            if helps == 'Was this review helpful to you?':
                # reviews with this message haven't yet received any feedback;
                # score them [0, 0] so they are filtered at preprocessing
                helpp = [0, 0]
            elif '</span>' in helps:
                # markup with nested spans is skipped (presumably a wrapper, not the vote text)
                continue
            elif 'people found this helpful' in helps:
                # reviews with this message contain the feedback counts; keep only the numbers
                helpp = [int(s) for s in helps.split() if s.isdigit()]
                if len(helpp) < 2:
                    # not a usable (helpful, total) pair
                    helpp = [0, 0]
            else:
                # unrecognised text: record "no feedback" rather than re-using the
                # previous review's votes (or failing on an unbound name)
                helpp = [0, 0]
            help_rows.append(helpp)
        review_rows.extend(review.get_attribute('innerHTML') for review in reviews)
        author_rows.extend(author.get_attribute('innerHTML') for author in authors)
        date_rows.extend(date.get_attribute('innerHTML').replace("on", "") for date in dates)
        star_rows.extend(star.get_attribute('innerHTML').split()[0] for star in stars)
finally:
    # always release the browser, even when a page load or lookup fails
    driver.quit()

df_help = pd.DataFrame({'help': pd.Series(help_rows, dtype=object)})
df_review = pd.DataFrame({'review': pd.Series(review_rows, dtype=object)})
df_date = pd.DataFrame({'date': pd.Series(date_rows, dtype=object)})
df_author = pd.DataFrame({'author': pd.Series(author_rows, dtype=object)})
df_star = pd.DataFrame({'star': pd.Series(star_rows, dtype=object)})

# Column assignment aligns on the row index, so the fields are joined by position.
df_help['review'] = df_review['review']
df_help['date'] = df_date['date']
df_help['author'] = df_author['author']
df_help['star'] = df_star['star']

In [ ]:
# Split each scraped [helpful, total] vote list into numeric columns. A list with
# fewer than two numbers is not a usable pair and counts as (0, 0), so it is removed
# by the n_total filter below instead of raising IndexError.
df_help['n_help'] = df_help['help'].apply(lambda x: x[0] if len(x) >= 2 else 0)   # number of 'helpful' feedbacks from customers
df_help['n_total'] = df_help['help'].apply(lambda x: x[1] if len(x) >= 2 else 0)  # total number of feedbacks, 'helpful' or 'unhelpful'
df_help['helpful score'] = df_help['n_help'] / df_help['n_total']  # expected range 0 to 1; 0/0 gives NaN

# Keep only reviews with at least 4 feedbacks, with the columns in report order
# (the raw 'help' column is left out).
cols = ['author', 'star', 'date', 'review', 'n_help', 'n_total', 'helpful score']
df = df_help.loc[df_help['n_total'] >= 4, cols]

In [ ]:
# Save `df` (the filtered review table built in the previous cell) as UTF-8 CSV.
df.to_csv(path_or_buf='fire_tablet_1500_200.csv', encoding='utf-8')

1.21) MS Surface


In [ ]:
# 1.21) MS Surface -- scrape the review pages.
# For each page, read the helpfulness votes, review text, author, date and star
# rating of every review, then assemble them into df_help (one row per review).

# Fresh per-product collectors: rows from a previously scraped product must not
# leak into this product's data, and appending to plain lists avoids growing a
# DataFrame one row at a time.
help_rows, review_rows, author_rows, date_rows, star_rows = [], [], [], [], []

# Adjacent string literals instead of a backslash continuation inside the string,
# which embedded the next line's indentation (spaces) in the URL.
page_url = ("http://static.amazon.com/Microsoft-Surface-Tablet-10-8-Inch-Windows/product-reviews/B012DTDBI8/"
            "ref=cm_cr_getr_d_paging_btm_{page}"
            "?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber={page}")

driver = webdriver.Firefox()
try:
    for page in range(1, 52):
        driver.get(page_url.format(page=page))
        authors = driver.find_elements(By.CSS_SELECTOR, "a.a-size-base.a-link-normal.author")
        dates = driver.find_elements(By.CSS_SELECTOR, "div.a-fixed-right-grid-col.a-col-left span.a-size-base.a-color-secondary.review-date")
        helpss = driver.find_elements(By.CSS_SELECTOR, "div.a-section.review span.cr-vote-buttons span.a-color-secondary")
        reviews = driver.find_elements(By.CSS_SELECTOR, "span.a-size-base.review-text")
        stars = driver.find_elements(By.CSS_SELECTOR, 'div.a-fixed-right-grid-col.a-col-left a.a-link-normal i.a-icon span.a-icon-alt')
        for helps in helpss:
            helps = helps.get_attribute('innerHTML').replace(",", "")
            if helps == 'Was this review helpful to you?':
                # no feedback yet: score [0, 0] so the review is filtered at preprocessing
                helpp = [0, 0]
            elif '</span>' in helps:
                # markup with nested spans is skipped (presumably a wrapper, not the vote text)
                continue
            elif 'people found this helpful' in helps:
                # keep only the numbers from the vote summary
                helpp = [int(s) for s in helps.split() if s.isdigit()]
                if len(helpp) < 2:
                    # not a usable (helpful, total) pair
                    helpp = [0, 0]
            else:
                # unrecognised text: record "no feedback" rather than re-using the
                # previous review's votes (or failing on an unbound name)
                helpp = [0, 0]
            help_rows.append(helpp)
        review_rows.extend(review.get_attribute('innerHTML') for review in reviews)
        author_rows.extend(author.get_attribute('innerHTML') for author in authors)
        date_rows.extend(date.get_attribute('innerHTML').replace("on", "") for date in dates)
        star_rows.extend(star.get_attribute('innerHTML').split()[0] for star in stars)
finally:
    # always release the browser, even when a page load or lookup fails
    driver.quit()

df_help = pd.DataFrame({'help': pd.Series(help_rows, dtype=object)})
df_review = pd.DataFrame({'review': pd.Series(review_rows, dtype=object)})
df_date = pd.DataFrame({'date': pd.Series(date_rows, dtype=object)})
df_author = pd.DataFrame({'author': pd.Series(author_rows, dtype=object)})
df_star = pd.DataFrame({'star': pd.Series(star_rows, dtype=object)})

# Column assignment aligns on the row index, so the fields are joined by position.
df_help['review'] = df_review['review']
df_help['date'] = df_date['date']
df_help['author'] = df_author['author']
df_help['star'] = df_star['star']

In [ ]:
# Split each scraped [helpful, total] vote list into numeric columns. A list with
# fewer than two numbers is not a usable pair and counts as (0, 0), so it is removed
# by the n_total filter below instead of raising IndexError.
df_help['n_help'] = df_help['help'].apply(lambda x: x[0] if len(x) >= 2 else 0)   # number of 'helpful' feedbacks
df_help['n_total'] = df_help['help'].apply(lambda x: x[1] if len(x) >= 2 else 0)  # total number of feedbacks
df_help['helpful score'] = df_help['n_help'] / df_help['n_total']  # expected range 0 to 1; 0/0 gives NaN

# Keep only reviews with at least 4 feedbacks, with the columns in report order
# (the raw 'help' column is left out).
cols = ['author', 'star', 'date', 'review', 'n_help', 'n_total', 'helpful score']
df = df_help.loc[df_help['n_total'] >= 4, cols]

In [ ]:
# Save `df` (the filtered review table built in the previous cell) as UTF-8 CSV.
df.to_csv(path_or_buf='surface_tablet.csv', encoding='utf-8')

1.22) MS Surface


In [ ]:
# 1.22) MS Surface -- scrape the review pages.
# For each page, read the helpfulness votes, review text, author, date and star
# rating of every review, then assemble them into df_help (one row per review).

# Fresh per-product collectors: rows from a previously scraped product must not
# leak into this product's data, and appending to plain lists avoids growing a
# DataFrame one row at a time.
help_rows, review_rows, author_rows, date_rows, star_rows = [], [], [], [], []

# Adjacent string literals instead of a backslash continuation inside the string,
# which embedded the next line's indentation (spaces) in the URL.
page_url = ("http://static.amazon.com/Microsoft-Surface-Intel-Core-Windows/product-reviews/B00KHQWPZA/"
            "ref=cm_cr_getr_d_paging_btm_{page}"
            "?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber={page}")

driver = webdriver.Firefox()
try:
    for page in range(1, 30):
        driver.get(page_url.format(page=page))
        authors = driver.find_elements(By.CSS_SELECTOR, "a.a-size-base.a-link-normal.author")
        dates = driver.find_elements(By.CSS_SELECTOR, "div.a-fixed-right-grid-col.a-col-left span.a-size-base.a-color-secondary.review-date")
        helpss = driver.find_elements(By.CSS_SELECTOR, "div.a-section.review span.cr-vote-buttons span.a-color-secondary")
        reviews = driver.find_elements(By.CSS_SELECTOR, "span.a-size-base.review-text")
        stars = driver.find_elements(By.CSS_SELECTOR, 'div.a-fixed-right-grid-col.a-col-left a.a-link-normal i.a-icon span.a-icon-alt')
        for helps in helpss:
            helps = helps.get_attribute('innerHTML').replace(",", "")
            if helps == 'Was this review helpful to you?':
                # no feedback yet: score [0, 0] so the review is filtered at preprocessing
                helpp = [0, 0]
            elif '</span>' in helps:
                # markup with nested spans is skipped (presumably a wrapper, not the vote text)
                continue
            elif 'people found this helpful' in helps:
                # keep only the numbers from the vote summary
                helpp = [int(s) for s in helps.split() if s.isdigit()]
                if len(helpp) < 2:
                    # not a usable (helpful, total) pair
                    helpp = [0, 0]
            else:
                # unrecognised text: record "no feedback" rather than re-using the
                # previous review's votes (or failing on an unbound name)
                helpp = [0, 0]
            help_rows.append(helpp)
        review_rows.extend(review.get_attribute('innerHTML') for review in reviews)
        author_rows.extend(author.get_attribute('innerHTML') for author in authors)
        date_rows.extend(date.get_attribute('innerHTML').replace("on", "") for date in dates)
        star_rows.extend(star.get_attribute('innerHTML').split()[0] for star in stars)
finally:
    # always release the browser, even when a page load or lookup fails
    driver.quit()

df_help = pd.DataFrame({'help': pd.Series(help_rows, dtype=object)})
df_review = pd.DataFrame({'review': pd.Series(review_rows, dtype=object)})
df_date = pd.DataFrame({'date': pd.Series(date_rows, dtype=object)})
df_author = pd.DataFrame({'author': pd.Series(author_rows, dtype=object)})
df_star = pd.DataFrame({'star': pd.Series(star_rows, dtype=object)})

# Column assignment aligns on the row index, so the fields are joined by position.
df_help['review'] = df_review['review']
df_help['date'] = df_date['date']
df_help['author'] = df_author['author']
df_help['star'] = df_star['star']

In [ ]:
# Split each scraped [helpful, total] vote list into numeric columns. A list with
# fewer than two numbers is not a usable pair and counts as (0, 0), so it is removed
# by the n_total filter below instead of raising IndexError.
df_help['n_help'] = df_help['help'].apply(lambda x: x[0] if len(x) >= 2 else 0)   # number of 'helpful' feedbacks
df_help['n_total'] = df_help['help'].apply(lambda x: x[1] if len(x) >= 2 else 0)  # total number of feedbacks
df_help['helpful score'] = df_help['n_help'] / df_help['n_total']  # expected range 0 to 1; 0/0 gives NaN

# Keep only reviews with at least 4 feedbacks, with the columns in report order
# (the raw 'help' column is left out).
cols = ['author', 'star', 'date', 'review', 'n_help', 'n_total', 'helpful score']
df = df_help.loc[df_help['n_total'] >= 4, cols]

In [ ]:
# Save `df` (the filtered review table built in the previous cell) as UTF-8 CSV.
df.to_csv(path_or_buf='surface_tablet2.csv', encoding='utf-8')

1.23) MS Surface


In [ ]:
# 1.23) MS Surface -- scrape the review pages.
# For each page, read the helpfulness votes, review text, author, date and star
# rating of every review, then assemble them into df_help (one row per review).

# Fresh per-product collectors: rows from a previously scraped product must not
# leak into this product's data, and appending to plain lists avoids growing a
# DataFrame one row at a time.
help_rows, review_rows, author_rows, date_rows, star_rows = [], [], [], [], []

# Adjacent string literals instead of a backslash continuation inside the string,
# which embedded the next line's indentation (spaces) in the URL.
page_url = ("http://static.amazon.com/Microsoft-Surface-Pro-Intel-Core/product-reviews/B01606IDL0/"
            "ref=cm_cr_getr_d_paging_btm_{page}"
            "?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber={page}")

driver = webdriver.Firefox()
try:
    for page in range(1, 48):
        driver.get(page_url.format(page=page))
        authors = driver.find_elements(By.CSS_SELECTOR, "a.a-size-base.a-link-normal.author")
        dates = driver.find_elements(By.CSS_SELECTOR, "div.a-fixed-right-grid-col.a-col-left span.a-size-base.a-color-secondary.review-date")
        helpss = driver.find_elements(By.CSS_SELECTOR, "div.a-section.review span.cr-vote-buttons span.a-color-secondary")
        reviews = driver.find_elements(By.CSS_SELECTOR, "span.a-size-base.review-text")
        stars = driver.find_elements(By.CSS_SELECTOR, 'div.a-fixed-right-grid-col.a-col-left a.a-link-normal i.a-icon span.a-icon-alt')
        for helps in helpss:
            helps = helps.get_attribute('innerHTML').replace(",", "")
            if helps == 'Was this review helpful to you?':
                # no feedback yet: score [0, 0] so the review is filtered at preprocessing
                helpp = [0, 0]
            elif '</span>' in helps:
                # markup with nested spans is skipped (presumably a wrapper, not the vote text)
                continue
            elif 'people found this helpful' in helps:
                # keep only the numbers from the vote summary
                helpp = [int(s) for s in helps.split() if s.isdigit()]
                if len(helpp) < 2:
                    # not a usable (helpful, total) pair
                    helpp = [0, 0]
            else:
                # unrecognised text: record "no feedback" rather than re-using the
                # previous review's votes (or failing on an unbound name)
                helpp = [0, 0]
            help_rows.append(helpp)
        review_rows.extend(review.get_attribute('innerHTML') for review in reviews)
        author_rows.extend(author.get_attribute('innerHTML') for author in authors)
        date_rows.extend(date.get_attribute('innerHTML').replace("on", "") for date in dates)
        star_rows.extend(star.get_attribute('innerHTML').split()[0] for star in stars)
finally:
    # always release the browser, even when a page load or lookup fails
    driver.quit()

df_help = pd.DataFrame({'help': pd.Series(help_rows, dtype=object)})
df_review = pd.DataFrame({'review': pd.Series(review_rows, dtype=object)})
df_date = pd.DataFrame({'date': pd.Series(date_rows, dtype=object)})
df_author = pd.DataFrame({'author': pd.Series(author_rows, dtype=object)})
df_star = pd.DataFrame({'star': pd.Series(star_rows, dtype=object)})

# Column assignment aligns on the row index, so the fields are joined by position.
df_help['review'] = df_review['review']
df_help['date'] = df_date['date']
df_help['author'] = df_author['author']
df_help['star'] = df_star['star']

In [ ]:
# Split each scraped [helpful, total] vote list into numeric columns. A list with
# fewer than two numbers is not a usable pair and counts as (0, 0), so it is removed
# by the n_total filter below instead of raising IndexError.
df_help['n_help'] = df_help['help'].apply(lambda x: x[0] if len(x) >= 2 else 0)   # number of 'helpful' feedbacks
df_help['n_total'] = df_help['help'].apply(lambda x: x[1] if len(x) >= 2 else 0)  # total number of feedbacks
df_help['helpful score'] = df_help['n_help'] / df_help['n_total']  # expected range 0 to 1; 0/0 gives NaN

# Keep only reviews with at least 4 feedbacks, with the columns in report order
# (the raw 'help' column is left out).
cols = ['author', 'star', 'date', 'review', 'n_help', 'n_total', 'helpful score']
df = df_help.loc[df_help['n_total'] >= 4, cols]

In [ ]:
# Save `df` (the filtered review table built in the previous cell) as UTF-8 CSV.
df.to_csv(path_or_buf='surface_tablet3.csv', encoding='utf-8')

1.31) Galaxy Tab


In [ ]:
# 1.31) Galaxy Tab -- scrape the review pages.
# For each page, read the helpfulness votes, review text, author, date and star
# rating of every review, then assemble them into df_help (one row per review).

# Fresh per-product collectors: rows from a previously scraped product must not
# leak into this product's data, and appending to plain lists avoids growing a
# DataFrame one row at a time.
help_rows, review_rows, author_rows, date_rows, star_rows = [], [], [], [], []

# Adjacent string literals instead of a backslash continuation inside the string,
# which embedded the next line's indentation (spaces) in the URL.
page_url = ("http://www.amazon.com/Samsung-Galaxy-Tab-32GB-White/product-reviews/B010OTG3BQ/"
            "ref=cm_cr_getr_d_paging_btm_{page}"
            "?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber={page}")

driver = webdriver.Firefox()
try:
    for page in range(1, 74):
        driver.get(page_url.format(page=page))
        authors = driver.find_elements(By.CSS_SELECTOR, "a.a-size-base.a-link-normal.author")
        dates = driver.find_elements(By.CSS_SELECTOR, "div.a-fixed-right-grid-col.a-col-left span.a-size-base.a-color-secondary.review-date")
        helpss = driver.find_elements(By.CSS_SELECTOR, "div.a-section.review span.cr-vote-buttons span.a-color-secondary")
        reviews = driver.find_elements(By.CSS_SELECTOR, "span.a-size-base.review-text")
        stars = driver.find_elements(By.CSS_SELECTOR, 'div.a-fixed-right-grid-col.a-col-left a.a-link-normal i.a-icon span.a-icon-alt')
        for helps in helpss:
            helps = helps.get_attribute('innerHTML').replace(",", "")
            if helps == 'Was this review helpful to you?':
                # no feedback yet: score [0, 0] so the review is filtered at preprocessing
                helpp = [0, 0]
            elif '</span>' in helps:
                # markup with nested spans is skipped (presumably a wrapper, not the vote text)
                continue
            elif 'people found this helpful' in helps:
                # keep only the numbers from the vote summary
                helpp = [int(s) for s in helps.split() if s.isdigit()]
                if len(helpp) < 2:
                    # not a usable (helpful, total) pair
                    helpp = [0, 0]
            else:
                # unrecognised text: record "no feedback" rather than re-using the
                # previous review's votes (or failing on an unbound name)
                helpp = [0, 0]
            help_rows.append(helpp)
        review_rows.extend(review.get_attribute('innerHTML') for review in reviews)
        author_rows.extend(author.get_attribute('innerHTML') for author in authors)
        date_rows.extend(date.get_attribute('innerHTML').replace("on", "") for date in dates)
        star_rows.extend(star.get_attribute('innerHTML').split()[0] for star in stars)
finally:
    # always release the browser, even when a page load or lookup fails
    driver.quit()

df_help = pd.DataFrame({'help': pd.Series(help_rows, dtype=object)})
df_review = pd.DataFrame({'review': pd.Series(review_rows, dtype=object)})
df_date = pd.DataFrame({'date': pd.Series(date_rows, dtype=object)})
df_author = pd.DataFrame({'author': pd.Series(author_rows, dtype=object)})
df_star = pd.DataFrame({'star': pd.Series(star_rows, dtype=object)})

# Column assignment aligns on the row index, so the fields are joined by position.
df_help['review'] = df_review['review']
df_help['date'] = df_date['date']
df_help['author'] = df_author['author']
df_help['star'] = df_star['star']

In [ ]:
# Split each scraped [helpful, total] vote list into numeric columns. A list with
# fewer than two numbers is not a usable pair and counts as (0, 0), so it is removed
# by the n_total filter below instead of raising IndexError.
df_help['n_help'] = df_help['help'].apply(lambda x: x[0] if len(x) >= 2 else 0)   # number of 'helpful' feedbacks
df_help['n_total'] = df_help['help'].apply(lambda x: x[1] if len(x) >= 2 else 0)  # total number of feedbacks
df_help['helpful score'] = df_help['n_help'] / df_help['n_total']  # expected range 0 to 1; 0/0 gives NaN

# Keep only reviews with at least 4 feedbacks, with the columns in report order
# (the raw 'help' column is left out).
cols = ['author', 'star', 'date', 'review', 'n_help', 'n_total', 'helpful score']
df = df_help.loc[df_help['n_total'] >= 4, cols]

In [ ]:
# Save `df` (the filtered review table built in the previous cell) as UTF-8 CSV.
df.to_csv(path_or_buf='galaxy_tab1.csv', encoding='utf-8')

1.32) Galaxy Tab


In [ ]:
# 1.32) Galaxy Tab -- scrape the review pages.
# For each page, read the helpfulness votes, review text, author, date and star
# rating of every review, then assemble them into df_help (one row per review).

# Fresh per-product collectors: rows from a previously scraped product must not
# leak into this product's data, and appending to plain lists avoids growing a
# DataFrame one row at a time.
help_rows, review_rows, author_rows, date_rows, star_rows = [], [], [], [], []

# Adjacent string literals instead of a backslash continuation inside the string,
# which embedded the next line's indentation (spaces) in the URL.
page_url = ("http://static.amazon.com/Samsung-Galaxy-Tab-7-Inch-White/product-reviews/B00J8DL78O/"
            "ref=cm_cr_getr_d_paging_btm_{page}"
            "?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber={page}")

driver = webdriver.Firefox()
try:
    for page in range(1, 150):
        driver.get(page_url.format(page=page))
        authors = driver.find_elements(By.CSS_SELECTOR, "a.a-size-base.a-link-normal.author")
        dates = driver.find_elements(By.CSS_SELECTOR, "div.a-fixed-right-grid-col.a-col-left span.a-size-base.a-color-secondary.review-date")
        helpss = driver.find_elements(By.CSS_SELECTOR, "div.a-section.review span.cr-vote-buttons span.a-color-secondary")
        reviews = driver.find_elements(By.CSS_SELECTOR, "span.a-size-base.review-text")
        stars = driver.find_elements(By.CSS_SELECTOR, 'div.a-fixed-right-grid-col.a-col-left a.a-link-normal i.a-icon span.a-icon-alt')
        for helps in helpss:
            helps = helps.get_attribute('innerHTML').replace(",", "")
            if helps == 'Was this review helpful to you?':
                # no feedback yet: score [0, 0] so the review is filtered at preprocessing
                helpp = [0, 0]
            elif '</span>' in helps:
                # markup with nested spans is skipped (presumably a wrapper, not the vote text)
                continue
            elif 'people found this helpful' in helps:
                # keep only the numbers from the vote summary
                helpp = [int(s) for s in helps.split() if s.isdigit()]
                if len(helpp) < 2:
                    # not a usable (helpful, total) pair
                    helpp = [0, 0]
            else:
                # unrecognised text: record "no feedback" rather than re-using the
                # previous review's votes (or failing on an unbound name)
                helpp = [0, 0]
            help_rows.append(helpp)
        review_rows.extend(review.get_attribute('innerHTML') for review in reviews)
        author_rows.extend(author.get_attribute('innerHTML') for author in authors)
        date_rows.extend(date.get_attribute('innerHTML').replace("on", "") for date in dates)
        star_rows.extend(star.get_attribute('innerHTML').split()[0] for star in stars)
finally:
    # always release the browser, even when a page load or lookup fails
    driver.quit()

df_help = pd.DataFrame({'help': pd.Series(help_rows, dtype=object)})
df_review = pd.DataFrame({'review': pd.Series(review_rows, dtype=object)})
df_date = pd.DataFrame({'date': pd.Series(date_rows, dtype=object)})
df_author = pd.DataFrame({'author': pd.Series(author_rows, dtype=object)})
df_star = pd.DataFrame({'star': pd.Series(star_rows, dtype=object)})

# Column assignment aligns on the row index, so the fields are joined by position.
df_help['review'] = df_review['review']
df_help['date'] = df_date['date']
df_help['author'] = df_author['author']
df_help['star'] = df_star['star']

In [ ]:
# Split each scraped [helpful, total] vote list into numeric columns. A list with
# fewer than two numbers is not a usable pair and counts as (0, 0), so it is removed
# by the n_total filter below instead of raising IndexError.
df_help['n_help'] = df_help['help'].apply(lambda x: x[0] if len(x) >= 2 else 0)   # number of 'helpful' feedbacks
df_help['n_total'] = df_help['help'].apply(lambda x: x[1] if len(x) >= 2 else 0)  # total number of feedbacks
df_help['helpful score'] = df_help['n_help'] / df_help['n_total']  # expected range 0 to 1; 0/0 gives NaN

# Keep only reviews with at least 4 feedbacks, with the columns in report order
# (the raw 'help' column is left out).
cols = ['author', 'star', 'date', 'review', 'n_help', 'n_total', 'helpful score']
df = df_help.loc[df_help['n_total'] >= 4, cols]

In [ ]:
# Save `df` (the filtered review table built in the previous cell) as UTF-8 CSV.
df.to_csv(path_or_buf='galaxy_tab2.csv', encoding='utf-8')

1.33) Galaxy Tab


In [ ]:
# 1.33) Galaxy Tab -- scrape the review pages.
# For each page, read the helpfulness votes, review text, author, date and star
# rating of every review, then assemble them into df_help (one row per review).

# Fresh per-product collectors: rows from a previously scraped product must not
# leak into this product's data, and appending to plain lists avoids growing a
# DataFrame one row at a time.
help_rows, review_rows, author_rows, date_rows, star_rows = [], [], [], [], []

# Adjacent string literals instead of a backslash continuation inside the string,
# which embedded the next line's indentation (spaces) in the URL.
page_url = ("http://www.amazon.com/Samsung-Galaxy-7-Inch-White-SM-T210/product-reviews/B00D02AGU4/"
            "ref=cm_cr_arp_d_paging_btm_{page}"
            "?ie=UTF8&showViewpoints=1&sortBy=helpful&pageNumber={page}")

driver = webdriver.Firefox()
try:
    for page in range(1, 50):
        driver.get(page_url.format(page=page))
        authors = driver.find_elements(By.CSS_SELECTOR, "a.a-size-base.a-link-normal.author")
        dates = driver.find_elements(By.CSS_SELECTOR, "div.a-fixed-right-grid-col.a-col-left span.a-size-base.a-color-secondary.review-date")
        helpss = driver.find_elements(By.CSS_SELECTOR, "div.a-section.review span.cr-vote-buttons span.a-color-secondary")
        reviews = driver.find_elements(By.CSS_SELECTOR, "span.a-size-base.review-text")
        stars = driver.find_elements(By.CSS_SELECTOR, 'div.a-fixed-right-grid-col.a-col-left a.a-link-normal i.a-icon span.a-icon-alt')
        for helps in helpss:
            helps = helps.get_attribute('innerHTML').replace(",", "")
            if helps == 'Was this review helpful to you?':
                # no feedback yet: score [0, 0] so the review is filtered at preprocessing
                helpp = [0, 0]
            elif '</span>' in helps:
                # markup with nested spans is skipped (presumably a wrapper, not the vote text)
                continue
            elif 'people found this helpful' in helps:
                # keep only the numbers from the vote summary
                helpp = [int(s) for s in helps.split() if s.isdigit()]
                if len(helpp) < 2:
                    # not a usable (helpful, total) pair
                    helpp = [0, 0]
            else:
                # unrecognised text: record "no feedback" rather than re-using the
                # previous review's votes (or failing on an unbound name)
                helpp = [0, 0]
            help_rows.append(helpp)
        review_rows.extend(review.get_attribute('innerHTML') for review in reviews)
        author_rows.extend(author.get_attribute('innerHTML') for author in authors)
        date_rows.extend(date.get_attribute('innerHTML').replace("on", "") for date in dates)
        star_rows.extend(star.get_attribute('innerHTML').split()[0] for star in stars)
finally:
    # always release the browser, even when a page load or lookup fails
    driver.quit()

df_help = pd.DataFrame({'help': pd.Series(help_rows, dtype=object)})
df_review = pd.DataFrame({'review': pd.Series(review_rows, dtype=object)})
df_date = pd.DataFrame({'date': pd.Series(date_rows, dtype=object)})
df_author = pd.DataFrame({'author': pd.Series(author_rows, dtype=object)})
df_star = pd.DataFrame({'star': pd.Series(star_rows, dtype=object)})

# Column assignment aligns on the row index, so the fields are joined by position.
df_help['review'] = df_review['review']
df_help['date'] = df_date['date']
df_help['author'] = df_author['author']
df_help['star'] = df_star['star']

In [ ]:
# Split each scraped [helpful, total] vote list into numeric columns. A list with
# fewer than two numbers is not a usable pair and counts as (0, 0), so it is removed
# by the n_total filter below instead of raising IndexError.
df_help['n_help'] = df_help['help'].apply(lambda x: x[0] if len(x) >= 2 else 0)   # number of 'helpful' feedbacks
df_help['n_total'] = df_help['help'].apply(lambda x: x[1] if len(x) >= 2 else 0)  # total number of feedbacks
df_help['helpful score'] = df_help['n_help'] / df_help['n_total']  # expected range 0 to 1; 0/0 gives NaN

# Keep only reviews with at least 4 feedbacks, with the columns in report order
# (the raw 'help' column is left out).
cols = ['author', 'star', 'date', 'review', 'n_help', 'n_total', 'helpful score']
df = df_help.loc[df_help['n_total'] >= 4, cols]

In [ ]:
# Save `df` (the filtered review table built in the previous cell) as UTF-8 CSV.
df.to_csv(path_or_buf='galaxy_tab3.csv', encoding='utf-8')