In [1]:
import requests # to make GET request
from bs4 import BeautifulSoup # to parse the HTML response
import time # to pause between calls
import pandas as pd # to see CSV
import os
# Work out of the data directory so the CSVs written at the end land there.
# NOTE(review): relative chdir assumes the notebook is launched from its own
# folder and is NOT idempotent -- running this cell twice walks up another
# level. Confirm before Restart & Run All.
os.chdir('../data/')
In [2]:
# Column schemas for the two output tables.
# Fix: the original reused one `columns` variable for both schemas; two
# distinct constants avoid the name-reuse hidden-state trap.
POST_COLUMNS = ['post id', 'title', 'text', 'href', 'user id', 'mother post id']
USER_COLUMNS = ['user id', 'user description']

# One row per scraped answer post.
df = pd.DataFrame(columns=POST_COLUMNS)
# One row per distinct forum user.
df_users = pd.DataFrame(columns=USER_COLUMNS)

# Running post counter; parse_post() increments it for every stored row.
post_id = 0
In [3]:
def parse_post(title, url):
    """Scrape one forum thread and append its answers to the module tables.

    Parameters
    ----------
    title : str
        Thread title, stored verbatim with every answer row.
    url : str
        Absolute URL of the thread page to fetch.

    Side effects: mutates the module-level globals `df` (one row per answer),
    `df_users` (one row per previously unseen user) and `post_id` (running
    post counter). Returns None.
    """
    global df, df_users, post_id
    response = requests.get(url)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html5lib')
    # Question body of the thread.
    # NOTE(review): extracted but never stored by the original code -- kept
    # for parity; presumably it was meant to become a "mother" post row.
    post_data = soup.find("div", class_="post_message").text
    post_answer = soup.find("div", id="post_answer_body")
    if post_answer is None:
        # Thread with no answers: nothing to record (original crashed here).
        return
    post_entries = post_answer.find_all("div", class_="post_entry")
    # All rows of this thread share the id of its first answer as the
    # "mother post id" (post_id is incremented before the first row is saved).
    post_id_mother = post_id + 1
    for post_entry in post_entries:
        post_id = post_id + 1
        subj_user = post_entry.find("div", class_="subj_user")
        user_id = subj_user.find('a')['id']
        user_name = subj_user.find('a').text
        post_message = post_entry.find("div", class_="post_message").text
        # One row per answer. Fix: use the `url` parameter -- the original
        # referenced a global `href` leaked from the page-scraping loop,
        # which raises NameError when the function is called standalone.
        df.loc[len(df)] = {"post id": post_id,
                           "title": title,
                           "text": post_message,
                           "href": url,
                           "user id": user_id,
                           "mother post id": post_id_mother}
        # Record each author exactly once.
        if user_id not in df_users['user id'].values:
            df_users.loc[len(df_users)] = {"user id": user_id,
                                           "user description": user_name}
In [4]:
# Index of the Autism / Asperger's Syndrome forum on MedHelp.
source = "http://www.medhelp.org/forums/Autism--Aspergers-Syndrome/show/187"
medhelp_path = "http://www.medhelp.org/"

# Walk all 51 listing pages of the forum index.
for page in range(1, 52):
    # Page 1 is the bare index URL; later pages take a ?page=N suffix.
    if page == 1:
        page_suffix = ''
    else:
        page_suffix = "/?page={}".format(str(page))
    url = source + page_suffix

    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html5lib')

    # Each thread on the index page is one "new_subject_element" div.
    subjects_list = soup.find("div", class_="subjects_list")
    thread_divs = subjects_list.find_all("div", class_="new_subject_element float_fix")
    for thread_div in thread_divs:
        subject_summary = thread_div.find("div", class_="subject_summary")
        # Thread link (made absolute) and title.
        href = medhelp_path + subject_summary.find('a')['href']
        title = subject_summary.find('a').text
        excerpt = subject_summary.find("div", class_="excerpt").text  # unused downstream
        # Fetch the thread itself and harvest its question + answers.
        parse_post(title, href)
        time.sleep(1)  # be polite to the server between thread fetches
    # Progress: cumulative number of rows scraped so far.
    print(len(df))
In [5]:
# Persist both tables as CSV in the current working directory (../data,
# set by the chdir in the first cell). Users first, then posts.
for frame, filename in ((df_users, 'MedHelp-users.csv'),
                        (df, 'MedHelp-posts.csv')):
    frame.to_csv(filename, index=False)
In [6]:
# Rich display of the scraped user table (bare last expression in the cell).
df_users
Out[6]:
In [7]:
# Total number of answer rows collected across all pages.
print(len(df))
In [ ]: