In [1]:
import pandas as pd
In [3]:
# Forward slashes work on every platform (including Windows); the original
# raw backslash path was Windows-only.
file = './data/reddit/ethereum/slim_sorted_comments.csv'
reader = pd.read_csv(file, chunksize=1000, header=0, index_col='commentId')
# Pull only the first 1000-row chunk for exploration; next() on the
# TextFileReader is clearer than a for-loop with an immediate break.
df = next(reader)
reader.close()
In [ ]:
## Looking at the comment data
In [5]:
# Peek at the first ten rows to sanity-check the parse.
df.head(10)
Out[5]:
In [ ]:
## finding deleted authors
In [8]:
# Comments whose author shows as '[deleted]' (account removed on reddit).
df[df['author'].eq('[deleted]')].head(10)
Out[8]:
In [ ]:
## grouping authors by score (high/low)
In [18]:
# Ten authors with the highest cumulative comment score.
(
    df.groupby('author')['score']
      .sum()
      .reset_index()
      .sort_values('score', ascending=False)
      .head(10)
)
Out[18]:
In [21]:
# Ten authors with the lowest cumulative comment score.
(
    df.groupby('author')['score']
      .sum()
      .reset_index()
      .sort_values('score', ascending=True)
      .head(10)
)
Out[21]:
In [ ]:
## grouping authors by # of comments
In [19]:
# Ten most prolific authors by number of comments (non-null postId count).
(
    df.groupby('author')['postId']
      .count()
      .reset_index()
      .sort_values('postId', ascending=False)
      .head(10)
)
Out[19]:
In [ ]:
## grouping authors by total comment length
In [28]:
# Ten authors by total characters across all their comments.
# Compute per-comment lengths first and sum them inside the groupby;
# the original summed the raw strings, which concatenates every comment
# per author before measuring it — quadratic in time and memory.
(
    df.assign(body=df['body'].str.len())
      .groupby('author')['body']
      .sum()
      .reset_index()
      .sort_values('body', ascending=False)
      .head(n=10)
)
Out[28]:
In [ ]: