In [1]:
import requests  # to make GET requests
from bs4 import BeautifulSoup  # to parse the HTML response
import time  # to pause between calls
import pandas as pd  # to build dataframes and write CSV files
import os

os.chdir('../data/')

In [2]:
columns=['post id','title','text','href','user id','mother post id']
df = pd.DataFrame(columns=columns)

columns=['user id','user description']
df_users = pd.DataFrame(columns=columns)

# Initialize post index
post_id=0

In [3]:
def parse_post(title,url):
    global df,df_users,post_id
    response = requests.get(url)
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html5lib')
    post_data=soup.find("div", class_="post_message").text  # question body (extracted but not stored in the dataframe)

    post_answer=soup.find("div", id="post_answer_body")
    post_entries=post_answer.find_all("div", class_="post_entry")
    post_id_mother=post_id + 1  # id that the first entry of this thread will receive; every entry points to it
    for post_entry in post_entries:
        post_id = post_id + 1
        subj_user=post_entry.find("div", class_="subj_user")
        user_id=subj_user.find('a')['id']
        user_name=subj_user.find('a').text
        post_message=post_entry.find("div",class_="post_message").text
        #
        # Add post data to dataframe
        #
        newrow={"post id":post_id,
                "title":title,
                "text":post_message,
                "href":href,
                "user id":user_id,
                "mother post id":post_id_mother}
        df.loc[len(df.values)]=newrow
        #
        # Update user dataframe:
        #
        newrow={"user id":user_id,
              "user description":user_name}
        if user_id not in df_users['user id'].values:
            df_users.loc[len(df_users)]=newrow
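
A quick sketch (not part of the original run) of how the selectors in parse_post behave, using a made-up HTML fragment; only the tag ids and class names are taken from the real page, the content is invented for illustration:

In [ ]:
# Hypothetical fragment mimicking the structure parse_post() expects.
toy_html = """
<div id="post_answer_body">
  <div class="post_entry">
    <div class="subj_user"><a id="user_1">SomeUser</a></div>
    <div class="post_message">An example answer.</div>
  </div>
</div>
"""
toy_soup = BeautifulSoup(toy_html, 'html5lib')
for entry in toy_soup.find("div", id="post_answer_body").find_all("div", class_="post_entry"):
    print(entry.find("div", class_="subj_user").find('a')['id'],
          entry.find("div", class_="post_message").text.strip())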

In [4]:
source="http://www.medhelp.org/forums/Autism--Aspergers-Syndrome/show/187"



for page in range(1,52):  # pages 1..51 of the forum index
    page_suffix="/?page={}".format(page)
    if page == 1:
        page_suffix=''
    url=source+page_suffix
    response = requests.get(url)
    page_source = response.text
    soup = BeautifulSoup(page_source, 'html5lib')
    medhelp_path="http://www.medhelp.org/"
    subjects_list=soup.find("div",class_="subjects_list")
    new_subject_elements=subjects_list.find_all("div", class_="new_subject_element float_fix")
    for new_subject_element in new_subject_elements:
        subject_summary=new_subject_element.find("div", class_="subject_summary")
        # Get href of the full thread, to be parsed next
        href=subject_summary.find('a')['href']
        href=medhelp_path+href
        title=subject_summary.find('a').text
        excerpt=subject_summary.find("div", class_="excerpt").text
        #
        # Now that we have the title and href of each thread,
        # extract the question and all answers in that thread.
        #
        parse_post(title,href)
        time.sleep(1)
    print(len(df))


37
56
74
112
139
168
196
216
244
269
283
311
333
345
364
380
401
417
439
465
492
526
552
598
627
660
693
716
749
773
807
840
911
946
998
1060
1108
1163
1223
1268
1313
1342
1363
1401
1477
1510
1594
1669
1708
1778
1813
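
For reference, a small check (outside the original run) of the URLs the loop above builds: page 1 uses the bare forum URL and later pages get a ?page=N suffix.

In [ ]:
# Reproduce the URL construction from the loop above for the first three pages.
for page in range(1, 4):
    page_suffix = '' if page == 1 else '/?page={}'.format(page)
    print(source + page_suffix)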

In [5]:
df_users.to_csv('MedHelp-users.csv',index=False)
df.to_csv('MedHelp-posts.csv',index=False)
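
As a quick sanity check (not in the original notebook), the two files can be read back to confirm the exports round-trip; paths are relative to the ../data/ directory set in the first cell.

In [ ]:
# Reload the CSVs written above and confirm the row counts match the dataframes in memory.
posts_check = pd.read_csv('MedHelp-posts.csv')
users_check = pd.read_csv('MedHelp-users.csv')
print(len(posts_check) == len(df), len(users_check) == len(df_users))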

In [6]:
df_users


Out[6]:
user id user description
0 user_340688 Rachel Thompson, Ph.D., BCBA
1 user_1566928 CirclesLady29
2 user_15010831 Rosseau
3 user_340657 Myrna Libby, Ph.D., BCBA
4 user_391640 babesmissy
5 user_388553 Grandmother1941
6 user_393618 fidgit
7 user_395246 sljenkins
8 user_347888 MaryannesMom
9 user_397233 aspiemom
10 user_436623 autistic_mom07
11 user_9757959 Alainee
12 user_340676 Jason C Bourret, Ph.D., BCBA-D
13 user_9351486 vincentcausse
14 user_9280615 mentor4succes
15 user_8221281 Pantx
16 user_6976850 Akita0419
17 user_1785966 Regret2011
18 user_7980329 Kamsthere
19 user_7329383 Hannahvictoria98
20 user_7037769 FollowerofChrist
21 user_4447834 gabrielamg
22 user_6554120 ZapCat
23 user_7154461 PresleyNic
24 user_6929994 Jane1211
25 user_6573002 Jellybean1986
26 user_6734966 amethyst111
27 user_6591769 lillibetlayne
28 user_6506090 megamomz6
29 user_6333487 exfulgere
... ... ...
475 user_518366 troisboyz
476 user_356327 angelinamarina
477 user_92911 LukeL
478 user_9995 wmac
479 user_380759 jaipur
480 user_333573 crayons
481 user_371209 barbsbit
482 user_282524 rebbecca
483 user_304011 jerry9798
484 user_368946 bweebles
485 user_337410 tomcat47
486 user_362088 aver
487 user_367032 EdieMarie
488 user_474322 Presidents
489 user_367308 JollyHolly1221
490 user_365670 bellemom
491 user_317629 BlueEgg
492 user_287540 littlebartie
493 user_361661 jrobb1564
494 user_596996 keloz
495 user_336901 Atto786
496 user_681771 smettoh
497 user_726652 eloisa7
498 user_359999 Danalou275
499 user_355729 PATTI143
500 user_364792 losifat
501 user_360460 TammyLynn1976
502 user_361230 beancounter68
503 user_212002 Susie2007
504 user_345311 Claudinne

505 rows × 2 columns


In [7]:
print(len(df))


1813

In [ ]: