In [1]:
import pandas as pd
In [2]:
# Set up paths: remember where the notebook started and work out of ../data.
import os
import sys

# Absolute directory the notebook starts in; local modules are imported from here.
this_path = os.getcwd()
# Anchor the data directory to the captured absolute path rather than the
# current (mutable) cwd, so the target does not drift if the cell is re-run.
os.chdir(os.path.abspath(os.path.join(this_path, "..", "data")))
# Guard against re-runs: the original unconditional insert prepended a
# duplicate sys.path entry every time the cell was executed.
if this_path not in sys.path:
    sys.path.insert(0, this_path)
In [3]:
# Scraped Autism Parent Magazine posts; column 0 of the CSV is the saved
# DataFrame index (the file lives in ../data, the cwd set above).
infile="AutismParentMagazine-posts.csv"
df = pd.read_csv(infile,index_col=0)
# Quick visual sanity check of the loaded frame.
df.head(2)
Out[3]:
In [4]:
df['text']=df['text'].map(lambda x: x.replace("Continue Reading",""))
In [5]:
# Map each title to the list of categories it appears under. The same post
# (title) can be filed under several categories, so gather every category
# row for the title and keep only the distinct values (set round-trip
# removes repeated elements).
dic_category = {
    title: list(set(df.loc[df['title'] == title, 'category'].values))
    for title in df['title'].unique()
}
In [6]:
# Collapse duplicate posts: keep the first row per title and rebuild a
# clean 0..n-1 index. The per-title category lists collected above are
# merged back into the surviving rows in the next cell.
df = df.drop_duplicates(subset='title').reset_index(drop=True)
df.head(2)
Out[6]:
In [7]:
# Replace each row's single category with the stringified list of ALL
# categories recorded for its title. Titles are unique after the dedup
# cell, so a single vectorized .map over 'title' yields exactly the same
# str(list) values as the previous row-by-row loop of
# df.loc[ii, ['title']].values[0] reads and .loc writes — in one pass
# instead of two DataFrame lookups per row.
df['category'] = df['title'].map(lambda t: str(dic_category[t]))
In [8]:
# Persist the cleaned, deduplicated frame (the 'category' column now holds
# stringified lists) into the ../data cwd. The index is written as the
# first column, so the file round-trips via read_csv(..., index_col=0).
outfile="AutismParentMagazine-posts-clean.csv"
df.to_csv(outfile)