In [1]:
import pandas as pd

In [2]:
# Set up paths: remember the notebook's directory, work from ../data
import os
import sys

# Capture the launch directory only on the first run of this cell. On a
# re-run the working directory is already the data directory, so re-reading
# os.getcwd() would overwrite this_path with it, push a wrong entry onto
# sys.path, and make "../data" resolve relative to the data directory.
if "this_path" not in globals():
    this_path = os.getcwd()
    # Put the launch directory first on the module search path.
    sys.path.insert(0, this_path)

# Anchor the data directory to the launch directory (not the current one) so
# the cell lands in the same place however many times it is executed.
os.chdir(os.path.join(this_path, os.pardir, "data"))

In [3]:
# Read the posts CSV, using its first column as the index, then report the
# number of rows and preview the first two.
infile = "ehealthforum-posts.csv"
df = pd.read_csv(filepath_or_buffer=infile, index_col=0)
print(df.shape[0])
df.iloc[:2]


1118
Out[3]:
title text href user id mother post id
post id
1 Possible autism signs ? \nmy nephew who is four years has very bad moo... http://ehealthforum.com/health/possible-autism... 55473.0 0
2 Possible autism signs ? \nHi, welcome to the ehealth forum and I am gl... http://ehealthforum.com/health/possible-autism... 239324.0 0

In [4]:
# Trim leading/trailing whitespace from every post body. The vectorised .str
# accessor passes missing values (NaN) through unchanged, whereas calling
# x.strip() on each element raises AttributeError on the first NaN.
df['text'] = df['text'].str.strip()
df.head(1)


Out[4]:
title text href user id mother post id
post id
1 Possible autism signs ? my nephew who is four years has very bad mood ... http://ehealthforum.com/health/possible-autism... 55473.0 0

In [5]:
# Posts whose text contains this notice are dropped.
text = "This post has been removed because"

print(len(df))
# regex=False matches the notice literally instead of as a regular expression.
# na=True marks rows with missing text for removal as well, which is what the
# previous `== False` comparison did implicitly (NaN == False is False).
df = df[~df['text'].str.contains(text, regex=False, na=True)]
print(len(df))


1118
1107

In [6]:
# Keep only posts whose text is longer than three characters; anything
# shorter is treated as empty.
df = df.loc[df['text'].str.len().gt(3)]
print(df.shape[0])


1060

In [7]:
# Write the cleaned frame to CSV (to_csv writes the index column by default).
outfile = "ehealthforum-posts-clean.csv"
df.to_csv(path_or_buf=outfile)