In [1]:
import pandas as pd
In [2]:
# Set up paths: remember where the notebook started and work out of ../data.
import os
import sys

# Absolute directory the notebook starts in; local modules are imported from here.
this_path = os.getcwd()
# Anchor the data directory to the captured absolute path rather than the
# current (mutable) cwd, so the target does not drift if the cell is re-run.
os.chdir(os.path.abspath(os.path.join(this_path, "..", "data")))
# Guard against re-runs: the original unconditional insert prepended a
# duplicate sys.path entry every time the cell was executed.
if this_path not in sys.path:
    sys.path.insert(0, this_path)
In [3]:
# Scraped Autism Parent Magazine posts; column 0 of the CSV is the saved
# DataFrame index (the file lives in ../data, the cwd set above).
infile="AutismParentMagazine-posts.csv"
df = pd.read_csv(infile,index_col=0)
# Quick visual sanity check of the loaded frame.
df.head(2)
Out[3]:
In [4]:
df['text']=df['text'].map(lambda x: x.replace("Continue Reading",""))
In [5]:
# Map each title to the list of categories it appears under. The same post
# (title) can be filed under several categories, so gather every category
# row for the title and keep only the distinct values (set round-trip
# removes repeated elements).
dic_category = {
    title: list(set(df.loc[df['title'] == title, 'category'].values))
    for title in df['title'].unique()
}
In [6]:
# Collapse duplicate posts: keep the first row per title and rebuild a
# clean 0..n-1 index. The per-title category lists collected above are
# merged back into the surviving rows in the next cell.
df = df.drop_duplicates(subset='title').reset_index(drop=True)
df.head(2)
Out[6]:
In [7]:
# Replace each row's single category with the stringified list of ALL
# categories recorded for its title. Titles are unique after the dedup
# cell, so a single vectorized .map over 'title' yields exactly the same
# str(list) values as the previous row-by-row loop of
# df.loc[ii, ['title']].values[0] reads and .loc writes — in one pass
# instead of two DataFrame lookups per row.
df['category'] = df['title'].map(lambda t: str(dic_category[t]))
In [8]:
# Persist the cleaned, deduplicated frame (the 'category' column now holds
# stringified lists) into the ../data cwd. The index is written as the
# first column, so the file round-trips via read_csv(..., index_col=0).
outfile="AutismParentMagazine-posts-clean.csv"
df.to_csv(outfile)