In [1]:
import pandas as pd

In [2]:
# Set up paths/ os
import os
import sys

# Record the launch directory only once. After the chdir below, os.getcwd()
# returns the data directory, so re-running this cell must not overwrite it.
if "this_path" not in globals():
    this_path = os.getcwd()

# Anchor the move on this_path so the cell lands in the same folder no matter
# how many times it is executed.
os.chdir(os.path.join(this_path, "../data"))

# Keep the launch directory first on the import path (so modules next to the
# notebook stay importable) without stacking duplicate entries on re-runs.
if not sys.path or sys.path[0] != this_path:
    sys.path.insert(0, this_path)

In [3]:
# Read the posts CSV, using its first column as the row index, and preview
# the first two rows.
infile = "AutismParentMagazine-posts.csv"
df = pd.read_csv(filepath_or_buffer=infile, index_col=0)
df.head(n=2)


Out[3]:
title source category text href
0 Autism, Head Banging and other Self Harming Be... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba For children with autism spectrum disorder (AS... https://www.autismparentingmagazine.com/autism...
1 High Quality ABA Treatment:  What Every Parent... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba Dr. Stephen Shore once said “If you’ve met one... https://www.autismparentingmagazine.com/high-q...

In [4]:
# Remove the literal "Continue Reading" marker from every post body.
# The vectorised .str accessor leaves missing (NaN) text untouched, whereas
# calling .replace on each element raised AttributeError on a NaN float.
# regex=False keeps this a plain substring replacement.
df['text'] = df['text'].str.replace("Continue Reading", "", regex=False)

In [5]:
# Build a title -> list-of-categories lookup, in case the same title appears
# in more than one row.

# Distinct titles, in order of first appearance.
unique_titles=df['title'].unique()

# Collect each title's categories in a single pass over the frame.
# dict.fromkeys drops repeated categories while keeping first-seen order; the
# earlier list(set(...)) ordering depended on string hashing, so the category
# lists could come out in a different order from one kernel session to the next.
categories_by_title = {
    post_title: list(dict.fromkeys(post_categories))
    for post_title, post_categories in df.groupby('title', sort=False)['category']
}

# Key the lookup by unique_titles. groupby skips missing titles, so any title
# without a group falls back to an empty list.
dic_category = {
    post_title: categories_by_title.get(post_title, [])
    for post_title in unique_titles
}
# Join the two very similar categories into one.
# NOTE: this snippet is commented out and does not run; it is kept for reference.
# cat1='category-general'
# cat2='category-autism-articles'
# row_index=df.loc[df['category']==cat2].index
# for row in row_index:
#     df.loc[row,['category']]=cat1
#
# cat1='category-autism-therapy'
# cat2='category-applied-behavior-analysis-aba'
# row_index=df.loc[df['category']==cat2].index
# for row in row_index:
#     df.loc[row,['category']]=cat1
#
# cat1='category-autism-and-diet'
# cat2='category-autism-and-food'
# row_index=df.loc[df['category']==cat2].index
# for row in row_index:
#     df.loc[row,['category']]=cat1

In [6]:
# Keep only the first row for each title and renumber the rows from 0;
# the category column is turned into a list in a later step.
df = df.drop_duplicates('title').reset_index(drop=True)
df.head(n=2)


Out[6]:
title source category text href
0 Autism, Head Banging and other Self Harming Be... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba For children with autism spectrum disorder (AS... https://www.autismparentingmagazine.com/autism...
1 High Quality ABA Treatment:  What Every Parent... https://www.autismparentingmagazine.com/ category-applied-behavior-analysis-aba Dr. Stephen Shore once said “If you’ve met one... https://www.autismparentingmagazine.com/high-q...

In [7]:
# Replace each row's category with the string form of the full category list
# recorded for its title in dic_category.
df['category'] = [str(dic_category[post_title]) for post_title in df['title']]

In [8]:
# Write the cleaned frame (index included) to a new CSV in the current
# working directory.
outfile = "AutismParentMagazine-posts-clean.csv"
df.to_csv(path_or_buf=outfile)